In [363]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [364]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [365]:
DATA_JSON_FILE = '/content/drive/MyDrive/News_Category_Dataset_v2.json'


In [366]:
data = pd.read_json(DATA_JSON_FILE, lines=True)
data

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
...,...,...,...,...,...,...
200848,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,"Reuters, Reuters",https://www.huffingtonpost.com/entry/rim-ceo-t...,Verizon Wireless and AT&T are already promotin...,2012-01-28
200849,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...,,https://www.huffingtonpost.com/entry/maria-sha...,"Afterward, Azarenka, more effusive with the pr...",2012-01-28
200850,SPORTS,"Giants Over Patriots, Jets Over Colts Among M...",,https://www.huffingtonpost.com/entry/super-bow...,"Leading up to Super Bowl XLVI, the most talked...",2012-01-28
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...,,https://www.huffingtonpost.com/entry/aldon-smi...,CORRECTION: An earlier version of this story i...,2012-01-28


In [367]:
target_category = data['category'].unique()
print(target_category)

['CRIME' 'ENTERTAINMENT' 'WORLD NEWS' 'IMPACT' 'POLITICS' 'WEIRD NEWS'
 'BLACK VOICES' 'WOMEN' 'COMEDY' 'QUEER VOICES' 'SPORTS' 'BUSINESS'
 'TRAVEL' 'MEDIA' 'TECH' 'RELIGION' 'SCIENCE' 'LATINO VOICES' 'EDUCATION'
 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'WELLNESS' 'PARENTING' 'HOME & LIVING' 'STYLE & BEAUTY' 'DIVORCE'
 'WEDDINGS' 'FOOD & DRINK' 'MONEY' 'ENVIRONMENT' 'CULTURE & ARTS']


In [368]:
data['categoryId'] = data['category'].factorize()[0]
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date,categoryId
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,0
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,1
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,1
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,1
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,1


In [369]:
category = data[['category', 'categoryId']].drop_duplicates().sort_values('categoryId')
category

Unnamed: 0,category,categoryId
0,CRIME,0
1,ENTERTAINMENT,1
11,WORLD NEWS,2
12,IMPACT,3
13,POLITICS,4
20,WEIRD NEWS,5
27,BLACK VOICES,6
35,WOMEN,7
37,COMEDY,8
69,QUEER VOICES,9


In [370]:
df = pd.DataFrame(data)
df.drop(df.iloc[:, 2::], inplace = True, axis = 1)
df = df[list(df.columns[~df.columns.duplicated()])]

In [371]:
data['category'].value_counts()


POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
FOOD & DRINK       6226
BUSINESS           5937
COMEDY             5175
SPORTS             4884
BLACK VOICES       4528
HOME & LIVING      4195
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3651
WOMEN              3490
IMPACT             3459
DIVORCE            3426
CRIME              3405
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
WORLDPOST          2579
RELIGION           2556
STYLE              2254
SCIENCE            2178
WORLD NEWS         2177
TASTE              2096
TECH               2082
MONEY              1707
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
ARTS & CULTURE     1339
ENVIRONMENT        1323
COLLEGE            1144
LATINO VOICES      1129
CULTURE & ARTS     1030
EDUCATION          1004
Name: category, 

In [372]:
# After cleaning
df['headline'] = df['headline'].str.replace('\W', ' ') # Removes punctuation
df['headline'] = df['headline'].str.lower()
df.head(20)

Unnamed: 0,category,headline
0,CRIME,there were 2 mass shootings in texas last week...
1,ENTERTAINMENT,will smith joins diplo and nicky jam for the 2...
2,ENTERTAINMENT,hugh grant marries for the first time at age 57
3,ENTERTAINMENT,jim carrey blasts castrato adam schiff and d...
4,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
5,ENTERTAINMENT,morgan freeman devastated that sexual harass...
6,ENTERTAINMENT,donald trump is lovin new mcdonald s jingle i...
7,ENTERTAINMENT,what to watch on amazon prime that s new this ...
8,ENTERTAINMENT,mike myers reveals he d like to do a fourth ...
9,ENTERTAINMENT,what to watch on hulu that s new this week


In [373]:
# def text_filtering(text):
#     #stop_words = list(stopwords.words('english'))
#     stop_words=[]
#     #stop_words=['those','is','and','a','at','these','for','these']
#                                               #removing unnecessary characters
#     for word in text.split(" "):
#         if word in stop_words:
#             text=text.replace(word,'')
#     text = text.rstrip() 
#     regular_expression = '[^a-zA-z\s]'
#     text = re.sub(regular_expression, '', text)
#     return text
# text_filtering("")

In [374]:
train_data = data[:int(0.9*len(data))]
test_and_dev_data = data[int(0.9*len(data)):]
                                                                       #splitting the data

dev_data=test_and_dev_data[:int(0.5 * len(test_and_dev_data))]
test_data=test_and_dev_data[int(0.5 * len(test_and_dev_data)):]

print(train_data)

             category                                           headline  \
0               CRIME  There Were 2 Mass Shootings In Texas Last Week...   
1       ENTERTAINMENT  Will Smith Joins Diplo And Nicky Jam For The 2...   
2       ENTERTAINMENT    Hugh Grant Marries For The First Time At Age 57   
3       ENTERTAINMENT  Jim Carrey Blasts 'Castrato' Adam Schiff And D...   
4       ENTERTAINMENT  Julianna Margulies Uses Donald Trump Poop Bags...   
...               ...                                                ...   
180762  HOME & LIVING   Craft Of The Day: Make A Pillow For Francophiles   
180763  HOME & LIVING  How To Find The Best Pillow For Back, Belly Or...   
180764   FOOD & DRINK               The 7 Best Boardwalks In The Country   
180765         TRAVEL  Torrance And Temecula: Much To My Surprise (PH...   
180766      PARENTING                       School Rules = Lower Obesity   

                                                  authors  \
0                         

In [406]:
#Splitting the dataset into Train, Validayelp_labelled

#Train dataset
X_train = train_data['headline']
Y_train = train_data['category']
                                                     #separating features and targets
#Dev dataset
X_dev = dev_data['headline']
Y_dev = dev_data['category']

#Test Dataset
X_test = test_data['headline']
Y_test = test_data['category']


print("Training data shape: ",train_data.shape)
print("Validation data shape: ",dev_data.shape)
print("Testing data shape: ",test_data.shape)
print(X_train[0])

Training data shape:  (180767, 7)
Validation data shape:  (10043, 7)
Testing data shape:  (10043, 7)
['There', 'Were', '2', 'Mass', 'Shootings', 'In', 'Texas', 'Last', 'Week,', 'But', 'Only', '1', 'On', 'TV']


In [376]:
train_category1 = train_data.loc[train_data.category == 'CRIME']
train_category2 = train_data.loc[train_data.category == 	'ENTERTAINMENT']
train_category3 = train_data.loc[train_data.category == 	'WORLD NEWS']
def cal_prior_prob(Y_train):
  prior_Probability = {}
  for category in np.unique(Y_train):
    categorycount = sum(Y_train == category)
    prior_Probability[category] = categorycount / Y_train.size
  return prior_Probability

prior_Probability = cal_prior_prob(Y_train)


print("Total articles of category crime in train dataset : ",len(train_category1))
print("Total articles of category ENTERTAINMENT in train dataset : ",len(train_category2))
print("Total articles of category WORLD NEWS in train dataset : ",len(train_category3))

print("Prior probability in train article dataset : ",prior_Probability['CRIME'])
print("Prior probability in train article dataset : ",prior_Probability['ENTERTAINMENT'])
print("Prior probability in train article dataset : ",prior_Probability['WORLD NEWS'])

Total articles of category crime in train dataset :  3289
Total articles of category ENTERTAINMENT in train dataset :  15715
Total articles of category WORLD NEWS in train dataset :  2177
Prior probability in train article dataset :  0.018194692615355678
Prior probability in train article dataset :  0.08693511536950882
Prior probability in train article dataset :  0.012043127340720374


In [377]:
train_data['headline'] = train_data['headline'].str.split()
vocab = []
for headline in train_data['headline']:
   for word in headline:
      vocab.append(word)

vocab = list(set(vocab))
vocab

['Matriarchs',
 'Furthest',
 "Women.'",
 '65,000',
 'Knowles:',
 'Vintage-Looking',
 'Solution.',
 'Bock:',
 'Fagen',
 'Indonesian',
 'Made-in-New',
 'Fundraising',
 'Jailing',
 "'Kiki'",
 'Alsace',
 'Tupperware',
 'Hemingway,',
 'Guilty’',
 'Complaining',
 'Warships,',
 'Simple:',
 'Known',
 'US-Mexico',
 'Unions:',
 "Spurs'",
 'MAINSTREAM',
 "'Apps'",
 'L-I-T',
 "CPD's",
 "Beating'",
 'More:',
 'Bourdin',
 'Satisfy',
 'Networking!',
 'Anchoring,',
 'Animism:',
 "Ambien'",
 'Debunk',
 'Weren’t',
 'billionaire',
 "'Blond'",
 "Creator's",
 'Him.)',
 'Chocolate-Eating',
 'Infighting',
 'Most-Watched',
 'Season!',
 'Figure',
 'Costly,',
 'Ease,',
 'MK',
 "Back,'",
 'Believe...',
 'Australians,',
 "Leather'",
 'Subway.',
 'Dellen:',
 'Nose,',
 "Evil'",
 'VIDEOS)',
 'Carpenter',
 'Coleman:',
 "Labelle's",
 'Ultra-Conservatives',
 'Stepmom?',
 'YMCAs',
 'Scalise',
 'Brady:',
 'TK',
 "'Mind-Boggling,'",
 'Beautiful',
 '‘Conservative’',
 'Re-Enactors',
 'Obamacare:',
 'Taraji,',
 '(Un)Consciou

In [379]:
stop_words=[]
f = open("/content/drive/MyDrive/stopwords.txt", "r")
for each in f.read().split(','):
    stop_words.append(each.replace("'","").strip())

def build_word_dict(data):
    word_freq = {}
    all_words=[]
    omit=[]
    for sentence in data:
            words=[]
            sentence = text_filtering(sentence)                               #building vovab list and word dictionary
            words=sentence.split(' ')
            for word in set(words) :
                word = word.lower()
                if word not in word_freq.keys():
                    word_freq[word] = 1
                    all_words.append(word)
                else:
                    word_freq[word] += 1
    return word_freq, all_words

    
       
allwords_freq,allwords_train=build_word_dict(vocab)
print(allwords_train)
allwords_freq

# def calculate_word_frequency(cat):
#     freq_dict = {}
#     for category in cat:
#             if word not in stop_words:
#                 if word not in freq_dict.keys():
#                     freq_dict[word] = 1
#                 else:
#                     freq_dict[word] += 1
  
#     return freq_dict

# freq_dict = calculate_word_frequency(X_train)

# frequency_words_train_category1 = calculate_word_frequency(train_category1.headline)

# frequency_words_train_category2 = calculate_word_frequency(train_category2.headline)

# frequency_words_train_category3 = calculate_word_frequency(train_category3.headline)

print("Frequency of word in Train data", len(allwords_freq))
print("Frequency of word in Crime category", len(allwords_train))
print("Frequency of word in entertainment category", allwords_freq.keys())
# print("Frequency of word in entertainment category", frequency_words_train_category3)

Frequency of word in Train data 61415
Frequency of word in Crime category 61415


In [380]:
allwords_freq,allwords_train=build_word_dict(vocab) 
allwords_freq                     #probability of all words
for word in allwords_freq:
        print(word,"--",allwords_freq[word],"     Probability of",word,"  is =",allwords_freq[word]/len(train_data))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
mouses -- 1      Probability of mouses   is = 5.531983160643259e-06
amigos -- 1      Probability of amigos   is = 5.531983160643259e-06
jesse -- 2      Probability of jesse   is = 1.1063966321286519e-05
mere -- 1      Probability of mere   is = 5.531983160643259e-06
cardi -- 2      Probability of cardi   is = 1.1063966321286519e-05
tawny -- 1      Probability of tawny   is = 5.531983160643259e-06
screamed -- 1      Probability of screamed   is = 5.531983160643259e-06
wegman -- 1      Probability of wegman   is = 5.531983160643259e-06
argus -- 1      Probability of argus   is = 5.531983160643259e-06
kirsh -- 1      Probability of kirsh   is = 5.531983160643259e-06
endofschool -- 1      Probability of endofschool   is = 5.531983160643259e-06
gnawing -- 1      Probability of gnawing   is = 5.531983160643259e-06
mongol -- 1      Probability of mongol   is = 5.531983160643259e-06
symphonies -- 1      Probability of symphonies 

In [410]:
#calculating conditional prob

def conditional_prob(train_data,X_train):
    train_pos_reviews=train_data.loc[train_data.category=='CRIME'].headline
    pos_words_freq,pos_words=build_word_dict(train_pos_reviews)
    train_neg_reviews=train_data.loc[train_data.category=='ENTERTAINMENT'].headline
    neg_words_freq,neg_words=build_word_dict(train_neg_reviews)

    neg_words_freq.pop('', None)                         #calculating conditional probabilities
    neg_condi_prob={}
    for word in neg_words_freq:
        #print(word,"--",neg_words_freq[word],"      Conditional probability of",word," given sentiment is neg=",neg_words_freq[word]/len(train_neg_reviews))
        neg_condi_prob[word]=neg_words_freq[word]/len(train_neg_reviews) 
        
    pos_words_freq.pop('', None)
    pos_condi_prob={}
    for word in pos_words_freq:
        #print(word,"--",pos_words_freq[word],"      Conditional probability of",word," given sentiment is pos=",pos_words_freq[word]/len(train_pos_reviews))
        pos_condi_prob[word]=pos_words_freq[word]/len(train_pos_reviews)
    return pos_condi_prob,neg_condi_prob
    
pos_condi_prob_train,neg_condi_prob_train=conditional_prob(train_data,X_train) 
# print(pos_condi_prob_train)
# print(neg_condi_prob_train)

AttributeError: ignored

In [384]:
from itertools import islice
dict1 = pos_condi_prob_train
sorted_dict = {}
sorted_keys = sorted(dict1, key=dict1.get,reverse=True)  

for w in sorted_keys:
    sorted_dict[w] = dict1[w]

def take(n, iterable):
    
    return list(islice(iterable, n))

n_items_pos = take(10, sorted_dict.items())
print(n_items_pos)

[('in', 0.29857099422316813), ('of', 0.1951961082395865), ('to', 0.18242626938279113), ('police', 0.13286713286713286), ('for', 0.12374581939799331), ('man', 0.12283368805107936), ('after', 0.12009729401033749), ('on', 0.08786865308604438), ('with', 0.08300395256916997), ('the', 0.07874733961690483)]


In [383]:
dict1 = neg_condi_prob_train
sorted_dict = {}
sorted_keys = sorted(dict1, key=dict1.get,reverse=True)  

for w in sorted_keys:
    sorted_dict[w] = dict1[w]
    
n_items_neg = take(10, sorted_dict.items())
print(n_items_neg)

[('the', 0.3077314667515113), ('to', 0.19032771237671015), ('in', 0.16175628380528156), ('of', 0.1599109131403118), ('a', 0.1578110085905186), ('and', 0.15583837098313713), ('is', 0.12446706967865097), ('for', 0.1086859688195991), ('on', 0.1070314985682469), ('with', 0.09615017499204581)]


calculating accuracy

In [382]:
def accuracy(real, prediction):
    count = 0
    for i in range(len(real)):                                #calculating accuracy
        if real[i] == prediction[i]:
            count+=1
    accuracy=count/len(real)*100
    return accuracy



In [None]:
def fit(train_data):
    y_preds = []
    
    word_freq = calculate_word_frequency(train_data)

    wordsFrequencyDict_Neg_probability, wordsFrequencyDict_Pos_probability = conditional_probability(neg_condi_prob_train,pos_condi_prob_train)
    liklihood_0 = 1
    liklihood_1 = 1

    for word in word_freq.keys():
        if not word in wordsFrequencyDict_Neg_probability:
            wordsFrequencyDict_Neg_probability[word] = 0.0
        
        if not word in wordsFrequencyDict_Pos_probability:
            wordsFrequencyDict_Pos_probability[word] = 0.0
        
        liklihood_0 = liklihood_0 * wordsFrequencyDict_Neg_probability[word]
        liklihood_1 = liklihood_1 * wordsFrequencyDict_Pos_probability[word]
    
    posterior_0 = prior_Probability['CRIME']*liklihood_0
    posterior_1 = prior_Probability['ENTERTAINMENT']*liklihood_1

    #Final class probabilities comparison
    if posterior_0 > posterior_1:
        prediction = 0
    else:
        prediction = 1
    
    y_preds.append(prediction)
    return y_preds

model = fit(X_train)
train_accuracy = accuracy(model, y_train)
print(f'Accuracy on Train Dataset is : {train_accuracy} %')

In [385]:
# a method to split the dataset into given no of folds
def cross_validation_split(dataset, n_folds):
  dataset_split = []
  dataset_copy = dataset
  fold_size = int(len(dataset) / n_folds)
  for _ in range(n_folds):
    fold = []
    while len(fold) < fold_size:
      r = randrange(dataset_copy.shape[0])
      index = dataset_copy.index[r]
      fold.append(dataset_copy.loc[index].values.tolist())
      dataset_copy = dataset_copy.drop(index)
    dataset_split.append(fold)
  return dataset_split

In [386]:
import numpy as np
from sklearn.model_selection import KFold
# data sample
data = np.array(dev_data)
# prepare cross validation
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
for train, test in kfold.split(data):
  print('train: %s, test: %s' % (data[train], data[test]))

train: [['MONEY' 'America Deserves a Raise'
  "Rep. Dennis Kucinich, Contributor\nU.S. Representative from Ohio's 10th District"
  ...
  'There can be no better way to honor the American worker then to support an increase in the minimum wage to a living wage of at least $10 per hour. That would be one Happy Labor Day.'
  Timestamp('2012-08-31 00:00:00') 38]
 ['STYLE & BEAUTY'
  'Abercrombie & Fitch Fails To Impress Teens With Sex Appeal' '' ...
  'Read more at BusinessWeek.com and sound off below: do you still shop at Abercrombie & Fitch? When we recently asked our staff'
  Timestamp('2012-08-31 00:00:00') 34]
 ['WEDDINGS'
  'TV Show Proposals: 8 Of The Best Fictional TV Show Marriage Proposals'
  '' ...
  "Real-life proposals are big productions these days. So, it really isn't a surprise that many television show characters opt"
  Timestamp('2012-08-31 00:00:00') 36]
 ...
 ['WEDDINGS' 'Celebrity Wives: Who Is The Best Hollywood Spouse?' '' ...
  'Hollywood wives might not have the bes

In [388]:
train_pos_reviews=train_data.loc[train_data.category=='CRIME'].headline
train_neg_reviews=train_data.loc[train_data.category=='ENTERTAINMENT'].headline
train_pos_reviews
# pos_words_freq,pos_words=build_word_dict(train_pos_reviews)
# neg_words_freq,neg_words=build_word_dict(train_neg_reviews)

0         [There, Were, 2, Mass, Shootings, In, Texas, L...
32        [Rachel, Dolezal, Faces, Felony, Charges, For,...
40        [Man, Faces, Charges, After, Pulling, Knife,, ...
42        [2, People, Injured, In, Indiana, School, Shoo...
185       [Maryland, Police, Charge, 3, Church, Leaders,...
                                ...                        
180583    [Doris, Thompson,, 82-Year-Old, Career, Crimin...
180584         [Toddler's, Karaoke, Leads, To, Two, Deaths]
180651    [Gabrielle, Swainson, Suspect,, Freddie, Grant...
180652    [Phone, Sex, Is, Not, Prostitution,, Italian, ...
180722                                    [The, Throwaways]
Name: headline, Length: 3289, dtype: object

In [387]:
pos_words_freq,pos_words=build_word_dict(vocab)
neg_words_freq,neg_words=build_word_dict(vocab)

In [402]:
def text_filtering(text):
    #stop_words = list(stopwords.words('english'))
    stop_words=[]
    #stop_words=['those','is','and','a','at','these','for','these']
    text = text.replace('\t', '')
    text = text.replace('\n', '')
    text = text.replace('\r', '')                                          #removing unnecessary characters
    for word in text.split(" "):
        if word in stop_words:
            text=text.replace(word,'')
    text = text.rstrip() 
    regular_expression = '[^a-zA-z\s]'
    text = re.sub(regular_expression, '', text)
    return text
text_filtering("")

''

In [408]:
def pred(data,ans):
    y_preds=[]
    for sentence in data:
        sentence=text_filtering(sentence)
        pos_lik=1
        neg_lik=1
        
        for word in sentence.split(' '):
            word=word.lower()                                        #predictions with smoothing
            if word not in pos_condi_prob.keys():
                pos_condi_prob[word]=1/(len(train_pos_reviews)+len(pos_words))
            if word not in neg_condi_prob.keys():
                neg_condi_prob[word]=1/(len(train_neg_reviews)+len(neg_words))
            pos_lik*=pos_condi_prob[word]
            neg_lik*=neg_condi_prob[word]
        pos_posterior=pos_lik*prior_Probability['ENTERTAINMENT']
        neg_posterior=neg_lik*prior_Probability['CRIME']
        if pos_posterior>neg_posterior:
            y_preds.append(1)
        else:
            y_preds.append(0)
    
    return score(ans.tolist(),y_preds)

print(pred(X_train,Y_train))
# print("Accuracy on training data after smoothing--",pred(X_train,Y_train) )   
# print("Accuracy on development data after smoothing--",pred(X_dev,Y_dev))
# print("Accuracy on test data after smoothing--",pred(X_test,Y_test)) 

AttributeError: ignored