### 5 news group

In [None]:
import pandas as pd
import os
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [1]:
!git clone https://github.com/kamalkraj/BERT-NER.git


In [None]:
cd BERT-NER/

In [None]:
!pip3 install -r requirements.txt

In [47]:
!python run_ner.py --data_dir=data/ --bert_model=bert-base-cased --task_name=ner --output_dir=out_ner --max_seq_length=128 --do_train --num_train_epochs 3 --do_eval --warmup_proportion=0.1

02/13/2021 20:54:15 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
02/13/2021 20:54:15 - INFO - pytorch_transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt not found in cache or force_download set to True, downloading to /tmp/tmpj2x71kof
100%|████████████████████████████████| 213450/213450 [00:00<00:00, 796128.35B/s]
02/13/2021 20:54:16 - INFO - pytorch_transformers.file_utils -   copying /tmp/tmpj2x71kof to cache at /tmp/xdg-cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
02/13/2021 20:54:16 - INFO - pytorch_transformers.file_utils -   creating metadata file for /tmp/xdg-cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
02/13/2021 20:54:16 - INFO - pytorch_t

In [4]:
pwd

'/home/sid015/DSC180'

In [49]:
%%writefile bert.py
"""BERT NER Inference."""

from __future__ import absolute_import, division, print_function

import json
import os

import torch
import torch.nn.functional as F
from nltk import word_tokenize
from pytorch_transformers import (BertConfig, BertForTokenClassification,
                                  BertTokenizer)


class BertNer(BertForTokenClassification):

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, valid_ids=None):
        sequence_output = self.bert(input_ids, token_type_ids, attention_mask, head_mask=None)[0]
        batch_size,max_len,feat_dim = sequence_output.shape
        valid_output = torch.zeros(batch_size,max_len,feat_dim,dtype=torch.float32,device='cuda' if torch.cuda.is_available() else 'cpu')
        for i in range(batch_size):
            jj = -1
            for j in range(max_len):
                    if valid_ids[i][j].item() == 1:
                        jj += 1
                        valid_output[i][jj] = sequence_output[i][j]
        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)
        return logits

class Ner:

    def __init__(self,model_dir: str):
        self.model , self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = self.model_config["label_map"]
        self.max_seq_length = self.model_config["max_seq_length"]
        self.label_map = {int(k):v for k,v in self.label_map.items()}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        model_config = os.path.join(model_dir,model_config)
        model_config = json.load(open(model_config))
        model = BertNer.from_pretrained(model_dir)
        tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=model_config["do_lower"])
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """ tokenize input"""
        words = word_tokenize(text)
        tokens = []
        valid_positions = []
        for i,word in enumerate(words):
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def preprocess(self, text: str):
        """ preprocess """
        tokens, valid_positions = self.tokenize(text)
        ## insert "[CLS]"
        tokens.insert(0,"[CLS]")
        valid_positions.insert(0,1)
        ## insert "[SEP]"
        tokens.append("[SEP]")
        valid_positions.append(1)
        segment_ids = []
        for i in range(len(tokens)):
            segment_ids.append(0)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_positions.append(0)
        return input_ids,input_mask,segment_ids,valid_positions

    def predict(self, text: str):
        input_ids,input_mask,segment_ids,valid_ids = self.preprocess(text)
        input_ids = torch.tensor([input_ids],dtype=torch.long,device=self.device)
        input_mask = torch.tensor([input_mask],dtype=torch.long,device=self.device)
        segment_ids = torch.tensor([segment_ids],dtype=torch.long,device=self.device)
        valid_ids = torch.tensor([valid_ids],dtype=torch.long,device=self.device)
        with torch.no_grad():
            logits = self.model(input_ids, segment_ids, input_mask,valid_ids)
        logits = F.softmax(logits,dim=2)
        logits_label = torch.argmax(logits,dim=2)
        logits_label = logits_label.detach().cpu().numpy().tolist()[0]

        logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)]

        logits = []
        pos = 0
        for index,mask in enumerate(valid_ids[0]):
            if index == 0:
                continue
            if mask == 1:
                logits.append((logits_label[index-pos],logits_confidence[index-pos]))
            else:
                pos += 1
        logits.pop()

        labels = [(self.label_map[label],confidence) for label,confidence in logits]
        words = word_tokenize(text)
        assert len(labels) == len(words)

        Person = []
        Location = []
        Organization = []
        Miscelleneous = []

        for word, (label, confidence) in zip(words, labels):
            if label=="B-PER" or label=="I-PER":
                Person.append(word)
            elif label=="B-LOC" or label=="I-LOC":
                Location.append(word)
            elif label=="B-ORG" or label=="I-ORG":
                Organization.append(word)
            elif label=="B-MISC" or label=="I-MISC":
                Miscelleneous.append(word)
            else:
                output = None

        output = []
        for word, (label, confidence) in zip(words, labels):      
            if label == "B-PER":
                output.append(' '.join(Person) + ": Person")
            if label=="B-LOC":
                output.append(' '.join(Location) + ": Location")
            if label=="B-MISC":
                output.append(' '.join(Miscelleneous) + ": Miscelleneous Entity")
            if label=="B-ORG":
                output.append(' '.join(Organization) + ": Organization")
                
        return output

Overwriting bert.py


In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sid015/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
cd /home/sid015/DSC180

/home/sid015/DSC180


In [6]:
tf=pd.read_csv('summary_sentences.csv')


In [2]:
cd /home/sid015/DSC180/BERT-NER

/home/sid015/DSC180/BERT-NER


In [2]:

from bert import Ner
model = Ner("out_ner/")



In [15]:
tf.summary_sentences

0        TimeWarner said fourth quarter sales rose 2% t...
1        For the full-year, TimeWarner posted a profit ...
2        Quarterly profits at US media giant TimeWarner...
3        However, the company said AOL's underlying pro...
4        Its profits were buoyed by one-off gains which...
                               ...                        
17716    I am actually convinced at this point that the...
17717    Scare-mongering articles about "addictive vide...
17718    When I was made redundant I told my partner I ...
17719    And more and more so I find I have less and le...
17720    They are all engineered in such a way that ear...
Name: summary_sentences, Length: 17721, dtype: object

In [11]:
ne=[]
for sentence in tf.summary_sentences.values:
    ne.append(model.predict(sentence))

In [16]:
tf['NE']=ne

In [44]:
for i in ne: 
    for j in range(len(i)):
        i[j]=i[j].split(':')[0]

In [55]:
s=tf[['doc','type_code','NE']]
s

Unnamed: 0,doc,type_code,NE
0,0,1,[TimeWarner]
1,0,1,[TimeWarner]
2,0,1,"[US, TimeWarner]"
3,0,1,[AOL]
4,0,1,"[Warner Bros AOL, Warner Bros AOL]"
...,...,...,...
17716,2224,5,[MMOPRGs]
17717,2224,5,"[Pong, Atari]"
17718,2224,5,[EverQuest]
17719,2224,5,[]


In [102]:
docs=[]
marker=[]
sets=set()
for i in range(1,len(s.doc.values)):
    if s.doc.values[i]==s.doc.values[i-1]:
        for j in set(s.NE.values[i-1]):
            sets.add(j)
    else:
        for j in set(s.NE.values[i-1]):
            sets.add(j)
        docs.append(list(sets))
        marker.append(s.doc.values[i-1])
        sets=set()
docs.append(list(sets))

In [103]:
len(docs)

2225

In [104]:
df=pd.read_csv('all_data.csv')
df['NE']=docs
df

Unnamed: 0.1,Unnamed: 0,text,type,summary,type_code,NE
0,0,Ad sales boost Time Warner profitQuarterly pro...,business,TimeWarner said fourth quarter sales rose 2% t...,1,"[TimeWarner, Warner Bros AOL, AOL, Time Warner..."
1,1,Dollar gains on Greenspan speechThe dollar has...,business,The dollar has hit its highest level against t...,1,"[China US, Robert Sinche, US US, New York, Ban..."
2,2,Yukos unit buyer faces loan claimThe owners of...,business,Yukos' owner Menatep Group says it will ask Ro...,1,"[Yugansk, Yugansk Rosneft, Moscow-based US, Me..."
3,3,High fuel prices hit BA's profitsBritish Airwa...,business,"Rod Eddington, BA's chief executive, said the ...",1,"[Rod Eddington, Mike Powell, Martin Broughton,..."
4,4,Pernod takeover talk lifts DomecqShares in UK ...,business,Pernod has reduced the debt it took on to fund...,1,"[Allied Domecq Pernod, Pernod Seagram Allied, ..."
...,...,...,...,...,...,...
2220,2220,BT program to beat dialler scamsBT is introduc...,tech,BT is introducing two initiatives to help beat...,5,"[BT, BT BT, UK, Icstis]"
2221,2221,Spam e-mails tempt net shoppersComputer users ...,tech,A third of them read unsolicited junk e-mail a...,5,[Brazilians]
2222,2222,Be careful how you codeA new European directiv...,tech,This goes to the heart of the European project...,5,"[European, Dutch European, Amazon, Directive o..."
2223,2223,US cyber security chief resignsThe man making ...,tech,Amit Yoran was director of the National Cyber ...,5,"[9/11, Yoran, National Cyber Security Division..."


In [113]:
for_vec=df.explode('NE')

In [115]:
for_vec

Unnamed: 0.1,Unnamed: 0,text,type,summary,type_code,NE
0,0,Ad sales boost Time Warner profitQuarterly pro...,business,TimeWarner said fourth quarter sales rose 2% t...,1,TimeWarner
0,0,Ad sales boost Time Warner profitQuarterly pro...,business,TimeWarner said fourth quarter sales rose 2% t...,1,Warner Bros AOL
0,0,Ad sales boost Time Warner profitQuarterly pro...,business,TimeWarner said fourth quarter sales rose 2% t...,1,AOL
0,0,Ad sales boost Time Warner profitQuarterly pro...,business,TimeWarner said fourth quarter sales rose 2% t...,1,Time Warner
0,0,Ad sales boost Time Warner profitQuarterly pro...,business,TimeWarner said fourth quarter sales rose 2% t...,1,US
...,...,...,...,...,...,...
2224,2224,Losing yourself in online gamingOnline role pl...,tech,He says that in the world of online gaming suc...,5,Ultima Online
2224,2224,Losing yourself in online gamingOnline role pl...,tech,He says that in the world of online gaming suc...,5,European
2224,2224,Losing yourself in online gamingOnline role pl...,tech,He says that in the world of online gaming suc...,5,EverQuest Ultima
2224,2224,Losing yourself in online gamingOnline role pl...,tech,He says that in the world of online gaming suc...,5,England


In [117]:
text_for_vect = ' '.join(for_vec.NE.apply(lambda x: str(x)))
count_vect = CountVectorizer()
count_vect = count_vect.fit([text_for_vect])
len(count_vect.vocabulary_)

8084

In [119]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.type_code, test_size=0.33, random_state=42)

X_count = count_vect.transform(X_train)

clf = LogisticRegression(max_iter = 100).fit(X_count,y_train)

In [121]:
docs_test = count_vect.transform(X_test)
predicted = clf.predict(docs_test)
np.mean(predicted == y_test)


0.9578231292517007

In [25]:
tf.to_csv('Summary_Sentences_NE.csv',index=False)

In [1]:
cd /home/sid015/DSC180/BERT-NER

/home/sid015/DSC180/BERT-NER


### 20 News Group Data Set

In [2]:



# datasets = load_dataset("conll2003")
# import pandas as pd
# import numpy as np
# df=pd.concat([pd.DataFrame(datasets['train']),(pd.DataFrame(datasets['validation'])),(pd.DataFrame(datasets['test']))])

In [4]:
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])" 
def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "i.e." in text: text = text.replace("i.e.","i<prd>e<prd>")
    if "e.g." in text:text = text.replace("e.g.","e<prd>g<prd>")
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [5]:
from sklearn.datasets import fetch_20newsgroups
import json

from sklearn.datasets import fetch_20newsgroups
def regex_condi(string):
    if ".com" in string:
        return False
    if ".edu" in string:
        return False
    if "@" in string:
        return False
    if 'Host'in string:
        return False
    if ".gov" in string:
        return False

    return True


def clean_text(string):
    string = " ".join([i for i in string.split() if regex_condi(i)])
    string = re.sub(r"Re:","", string)
    string = re.sub(r"Reply-To:","", string)

    string_lst = string.split(': ')
    
    
    toreturn = ""
    for i in np.arange(len(string_lst) - 1):
        if "Subject" in string_lst[i]:
            temp = string_lst[i+1].split(' ')
            toreturn += " " + ' '.join(temp[:-1])
        if "Keywords" in string_lst[i]:
            temp = string_lst[i+1].split(' ')
            toreturn += " " + ' '.join(temp[:-1])
        if "Organization" in string_lst[i]:
            temp = string_lst[i+1].split(' ')
            toreturn += " " + ' '.join(temp[:-1])
            
        if "Lines" in string_lst[i]:
            temp = ""
            for j in range(i+1,len(string_lst)):
                temp += string_lst[j]
            toreturn += temp
            break

    toreturn = re.sub(r"\'", "", toreturn)
    toreturn = re.sub(r"\>", "", toreturn)
    toreturn = re.sub(r"\:", "", toreturn)
    toreturn = re.sub(r"^[0-9]+ ", "", toreturn)
    toreturn = re.sub(r"\/", ' ',toreturn)
    toreturn = re.sub(r"\-{2,}", ' ',toreturn)
    toreturn = re.sub(r"\s+", ' ',toreturn)
    toreturn = toreturn.strip()
    return toreturn

In [6]:
newsgroups_train = fetch_20newsgroups(subset='train')
news_df = pd.DataFrame.from_dict(newsgroups_train,'index').T
news_df

Unnamed: 0,data,filenames,target_names,target,DESCR
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,/home/sid015/scikit_learn_data/20news_home/20n...,alt.atheism,7,.
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.graphics,4,.
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.os.ms-windows.misc,4,
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.sys.ibm.pc.hardware,1,_
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.sys.mac.hardware,14,2
...,...,...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,/home/sid015/scikit_learn_data/20news_home/20n...,,13,
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,/home/sid015/scikit_learn_data/20news_home/20n...,,4,
11311,From: westes@netcom.com (Will Estes)\nSubject:...,/home/sid015/scikit_learn_data/20news_home/20n...,,3,
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,/home/sid015/scikit_learn_data/20news_home/20n...,,1,


In [7]:
news_df.data = news_df.data.apply(lambda x: clean_text(x)+ '.')
news_df=news_df.reset_index()

In [8]:
news_df['summary_sentences']=news_df.data.apply(lambda x:split_into_sentences(x))
# df['text_sentences']=df.summary.apply(lambda x:split_into_sentences(x))
df_summary_sentences=news_df.summary_sentences.explode().reset_index()
df_summary_sentences=df_summary_sentences.merge(news_df[['index','target']], on="index", how = 'left')


In [9]:
df_summary_sentences.summary_sentences

0                                         WHAT car is this!
1                                                         ?
2         University of Maryland, College Park15 I was w...
3         It was a 2-door sports car, looked to be from ...
4                                 It was called a Bricklin.
                                ...                        
257812                                   Guess Ill miss it.
257813                           -((( Help me find my baby!
257814                                                    !
257815                                                    !
257816                                                 kjg.
Name: summary_sentences, Length: 257817, dtype: object

In [57]:
from tqdm import tqdm

for sentence in tqdm(df_summary_sentences.summary_sentences.values[170792:]):
    try:
        if len(sentence)<512:
            ne2.append(model.predict(sentence))
        else:
            ne2.append([])
    except (AssertionError,TypeError):
        ne2.append([])









  0%|          | 0/87025 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  0%|          | 4/87025 [00:00<40:07, 36.14it/s][A[A[A[A[A[A[A[A







  0%|          | 8/87025 [00:00<41:24, 35.03it/s][A[A[A[A[A[A[A[A







  0%|          | 12/87025 [00:00<41:32, 34.92it/s][A[A[A[A[A[A[A[A







  0%|          | 16/87025 [00:00<42:08, 34.41it/s][A[A[A[A[A[A[A[A







  0%|          | 20/87025 [00:00<41:29, 34.95it/s][A[A[A[A[A[A[A[A







  0%|          | 25/87025 [00:00<39:26, 36.77it/s][A[A[A[A[A[A[A[A







  0%|          | 30/87025 [00:00<37:34, 38.59it/s][A[A[A[A[A[A[A[A







  0%|          | 35/87025 [00:00<36:53, 39.30it/s][A[A[A[A[A[A[A[A







  0%|          | 39/87025 [00:01<40:11, 36.07it/s][A[A[A[A[A[A[A[A







  0%|          | 43/87025 [00:01<41:10, 35.21it/s][A[A[A[A[A[A[A[A







  0%|          | 47/87025 [00:01<43:13, 33.53it/s][A[A[A[A[A[A[A[A







  0%|          | 51

In [60]:
df_summary_sentences['NE']=ne2

In [63]:
df_summary_sentences

Unnamed: 0,index,summary_sentences,target,NE
0,0,WHAT car is this!,7,[]
1,0,?,7,[]
2,0,"University of Maryland, College Park15 I was w...",7,"[University of Maryland, College Park15]"
3,0,"It was a 2-door sports car, looked to be from ...",7,[]
4,0,It was called a Bricklin.,7,[Bricklin]
...,...,...,...,...
257812,11313,Guess Ill miss it.,8,[Ill]
257813,11313,-((( Help me find my baby!,8,[]
257814,11313,!,8,[]
257815,11313,!,8,[]


In [62]:
for i in ne2: 
    for j in range(len(i)):
        i[j]=i[j].split(':')[0]

In [71]:
s=df_summary_sentences[['index','target','NE']]
s=s.rename(columns={'index':'doc'})
s

Unnamed: 0,doc,target,NE
0,0,7,[]
1,0,7,[]
2,0,7,"[University of Maryland, College Park15]"
3,0,7,[]
4,0,7,[Bricklin]
...,...,...,...
257812,11313,8,[Ill]
257813,11313,8,[]
257814,11313,8,[]
257815,11313,8,[]


In [72]:
docs=[]
marker=[]
sets=set()
for i in range(1,len(s.doc.values)):
    if s.doc.values[i]==s.doc.values[i-1]:
        for j in set(s.NE.values[i-1]):
            sets.add(j)
    else:
        for j in set(s.NE.values[i-1]):
            sets.add(j)
        docs.append(list(sets))
        marker.append(s.doc.values[i-1])
        sets=set()
docs.append(list(sets))

In [73]:
len(docs)

11314

In [68]:
docs

[[],
 [],
 ['College Park15', 'University of Maryland'],
 [],
 ['Bricklin'],
 [],
 [],
 [],
 [],
 ['Lerxst', 'IL'],
 ['University of Washington11'],
 [],
 [],
 [],
 [],
 ['Guy Kuo'],
 [],
 [],
 [],
 ['Purdue University Engineering Computer Network36'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Tom Willis', 'Purdue Electrical Engineering'],
 ['F. W. Nietzsche'],
 ['Weitek P9000'],
 ['Weitek P9000',
  'Harris Computer Systems Division14 Distributionworld X-NewsreaderTIN',
  'Robert J.C. Kyanko'],
 [],
 [],
 ['Weiteks'],
 [],
 ['Joe Green Harris Corporation Computer Systems Division'],
 ['Jonathan Winters'],
 ['Smithsonian Observatory Cambridge MA USA23',
  'Shuttle',
  'Tom A Baker Pack Rat'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Jonathan'],
 ['Second Amendment'],
 ['D. Tavares Foxvog C'],
 ['D. Tavares John Lawrence Rutledge'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Doug Foxvog'],
 ['SKS

In [74]:
news_df['NE']=docs
news_df

Unnamed: 0,index,data,filenames,target_names,target,DESCR,summary_sentences,NE
0,0,"WHAT car is this!? University of Maryland, Col...",/home/sid015/scikit_learn_data/20news_home/20n...,alt.atheism,7,.,"[WHAT car is this!, ?, University of Maryland,...","[Bricklin, University of Maryland, College Par..."
1,1,"SI Clock Poll - Final Call SI,acceleration,clo...",/home/sid015/scikit_learn_data/20news_home/20n...,comp.graphics,4,.,"[SI Clock Poll - Final Call SI,acceleration,cl...","[University of Washington11, Guy Kuo]"
2,2,PB questions... Purdue University Engineering ...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.os.ms-windows.misc,4,,"[PB questions., ., ., Purdue University Engine...",[Purdue University Engineering Computer Networ...
3,3,Weitek P9000 ? Harris Computer Systems Divisio...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.sys.ibm.pc.hardware,1,_,"[Weitek P9000 ?, Harris Computer Systems Divis...","[Weitek P9000, Robert J.C. Kyanko, Weiteks, Ha..."
4,4,Shuttle Launch Question Smithsonian Astrophysi...,/home/sid015/scikit_learn_data/20news_home/20n...,comp.sys.mac.hardware,14,2,[Shuttle Launch Question Smithsonian Astrophys...,"[Smithsonian Observatory Cambridge MA USA23, S..."
...,...,...,...,...,...,...,...,...
11309,11309,Migraines and scans Invention Factorys BBS - N...,/home/sid015/scikit_learn_data/20news_home/20n...,,13,,[Migraines and scans Invention Factorys BBS - ...,"[ER, Jim Zisfein David Nye, New York City NY, ..."
11310,11310,Screen22 OrganizationTufts University - Medfor...,/home/sid015/scikit_learn_data/20news_home/20n...,,4,,[Screen22 OrganizationTufts University - Medfo...,"[Ethan Bodin, Screen22 OrganizationTufts Unive..."
11311,11311,Mounting CPU Cooler in vertical case Mail Grou...,/home/sid015/scikit_learn_data/20news_home/20n...,,3,,[Mounting CPU Cooler in vertical case Mail Gro...,"[DX2-66, Mail Group13, Will Estes Internet]"
11312,11312,Sphere from 4 points? Central Research Lab. Hi...,/home/sid015/scikit_learn_data/20news_home/20n...,,1,,"[Sphere from 4 points?, Central Research Lab.,...","[Central Research Lab, Tokyo, Graphics Gems Fa..."


In [75]:
for_vec=news_df.explode('NE')

In [97]:
text_for_vect = ' '.join(for_vec.NE.apply(lambda x: str(x)))
count_vect = CountVectorizer()
count_vect = count_vect.fit([text_for_vect])
len(count_vect.vocabulary_)

38433

In [98]:
# X_train, X_test, y_train, y_test = train_test_split(news_df.data, news_df.target.astype(int), test_size=0.33, random_state=42)

X_count = count_vect.transform(news_df.data)

clf = LogisticRegression(max_iter = 100).fit(X_count,news_df.target.astype(int))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [84]:
cd /home/sid015/DSC180/

/home/sid015/DSC180


In [85]:
df_summary_sentences.to_csv('20newsgroupNE.csv',index=False)

In [99]:
twenty_test = fetch_20newsgroups(subset='test')
twenty_test = pd.DataFrame.from_dict(twenty_test,'index').T
twenty_test=twenty_test.dropna(subset=['data'])
twenty_test.data = twenty_test.data.apply(lambda x: clean_text(x)+ '.')

docs_test = twenty_test.data
predicted = clf.predict(count_vect.transform(docs_test))
np.mean(predicted == twenty_test.target)

0.7469463621879979