Preprocessing using Stanford CoreNLP

In [1]:
# Preprocessing: lemmatize the headlines and drop the fields that are not needed (e.g. the article link).
#from stanfordcorenlp import StanfordCoreNLP

import json
import re
import stanfordnlp
from stanfordcorenlp import StanfordCoreNLP

def loadData(path="Sarcasm_Headlines_Dataset.json"):
    """Load the headline dataset from a JSON-lines file.

    Args:
        path: File to read; each non-blank line must hold one JSON document.
            Defaults to the dataset file in the working directory.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) would make json.loads fail.
        return [json.loads(line) for line in f if line.strip()]

#removes link key value pair
def extractData(data):
    """Build an index-keyed dict holding only the lemmatized headline and the label.

    Args:
        data: Sequence of records, each with "headline" and "is_sarcastic" entries.

    Returns:
        dict mapping each integer position to
        {"headline": lemmatize(<headline>), "is_sarcastic": <label>}; every
        other field of the input records is left out.
    """
    return {
        idx: {
            "headline": lemmatize(data[idx]["headline"]),
            "is_sarcastic": data[idx]["is_sarcastic"],
        }
        for idx in range(len(data))
    }
  
# Initialize the StanfordCoreNLP client, pointed at http://localhost on port 8000.
# NOTE(review): later cells rebind the name `nlp` to a spaCy pipeline, so lemmatize()
# only works while this binding is the current one; re-run this cell before calling it.
nlp = StanfordCoreNLP('http://localhost', port=8000, timeout=30000, lang='en')

def lemmatize(text):
    """Return the lemmas of every token in ``text`` as one flat list.

    The text is annotated through the module-level ``nlp`` client and the JSON
    response is parsed. Lemmas are collected from *all* sentences in the
    response, so text that is split into several sentences does not lose the
    tokens after the first one.

    Args:
        text: The string to lemmatize.

    Returns:
        list of lemma strings in token order; empty when the response holds
        no sentences.
    """
    response = nlp.annotate(text, properties={'annotators': 'tokenize,lemma', 'outputFormat': 'json'})
    parsed = json.loads(response)
    return [
        token['lemma']
        for sentence in parsed.get('sentences', [])
        for token in sentence['tokens']
    ]

# define preprocessing function
def preprocessData():
    """Load the raw dataset and return its lemmatized, label-only form.

    Returns:
        The dict produced by extractData() for the records read by loadData().
    """
    data = loadData()
    # Return the result; previously it was computed and then thrown away.
    return extractData(data)

# Keep the result in a variable: a bare call as the last expression of the cell
# would render the whole dictionary as cell output.
lemma_data = preprocessData()

Getting Contradiction Score

In [13]:
import pysenti
from senticnet.senticnet import SenticNet
import string

import json
import pprint
sn = SenticNet()
import requests

# Characters treated as punctuation; sentiment() skips any word containing one of them.
punctuations = set(string.punctuation)
# Cache of word -> sentiment score, filled in by sentiment() and kept across calls.
word_scores = {}


def get_top_concepts(words):
    """Fetch up to five related ConceptNet concept labels for each word.

    One request is made per word. The previous version joined all words into a
    single comma-separated lookup and then assigned the same edge list to
    every word, so all words received identical concepts.

    Args:
        words: Iterable of words to look up (lower-cased for the request).

    Returns:
        dict mapping each word, as given, to a list of at most five distinct
        ``end`` labels taken from the returned edges, in response order.
        A response without an ``edges`` entry yields an empty list.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    results = {}
    for word in words:
        url = f'http://api.conceptnet.io/c/en/{word.lower()}?rel=/r/RelatedTo&limit=10'
        # A timeout keeps one stalled request from hanging the whole feature run.
        response = requests.get(url, timeout=10)
        edges = response.json().get('edges', [])

        # NOTE(review): only the edge's `end` node is used; for some edges that
        # may be the queried word itself - confirm against the API response.
        top_concepts = []
        for edge in edges:
            end_label = edge['end']['label']
            if end_label not in top_concepts:
                top_concepts.append(end_label)
            if len(top_concepts) == 5:
                break
        results[word] = top_concepts

    return results

def getScoreConceptNet(words):
    """Score words through the SenticNet polarity of their related concepts.

    For every word, the concepts returned by get_top_concepts() are looked up
    in SenticNet; each polarity is multiplied by 4. Concepts missing from
    SenticNet (KeyError) and concepts whose scaled polarity is zero are ignored.

    Returns:
        dict mapping each word to the mean of its remaining concept scores,
        or None when no concept produced a score.
    """
    averaged = {}
    for word, concepts in get_top_concepts(words).items():
        polarities = []
        for concept in concepts:
            try:
                polarity = 4 * float(sn.polarity_value(concept))
            except KeyError:
                continue
            if polarity:
                polarities.append(polarity)
        averaged[word] = sum(polarities) / len(polarities) if polarities else None
    return averaged

def _lexicon_score(word):
    """Return the word's score from pysenti and/or SenticNet, or None if neither knows it."""
    try:
        ss_score = pysenti.get_senti(word).scale()
    except ValueError:
        ss_score = None

    try:
        # SenticNet polarity is multiplied by 4 before being combined.
        sn_score = 4 * float(sn.polarity_value(word))
    except KeyError:
        sn_score = None

    if ss_score is not None and sn_score is not None:
        return (ss_score + sn_score) / 2
    if ss_score is not None:
        return ss_score
    return sn_score


def sentiment(headline, verbose=False):
    """Sum the positive and negative word scores of a headline.

    The headline is split on whitespace and words containing any character of
    ``punctuations`` are skipped. Each remaining word is scored from the
    module-level ``word_scores`` cache, else from the lexicons, else (for words
    no lexicon knows) from one batched getScoreConceptNet() call. Scores > 0
    add to the positive sum, all others to the negative sum.

    Unlike the previous version, a word is cached only once its final score is
    known, and a ConceptNet-scored word that occurs several times contributes
    once per occurrence instead of only once.

    Args:
        headline: Headline text.
        verbose: When True, print the split words and the two sums.

    Returns:
        (sum_pos_score, sum_neg_score, contra), where contra is True when the
        headline has both a positive and a negative contribution.
    """
    words = headline.split()
    if verbose:
        print(words)
    sum_pos_score = 0
    sum_neg_score = 0
    pending_counts = {}  # word -> occurrences, for words needing the ConceptNet fallback

    for word in words:
        if any(char in punctuations for char in word):
            continue

        if word in pending_counts:
            pending_counts[word] += 1
            continue

        if word in word_scores:
            w_score = word_scores[word]
        else:
            w_score = _lexicon_score(word)
            if w_score is None:
                pending_counts[word] = 1
                continue
            word_scores[word] = w_score

        if w_score > 0:
            sum_pos_score += w_score
        else:
            sum_neg_score += w_score

    # One batched lookup for the words not found in either lexicon.
    if pending_counts:
        batch_scores = getScoreConceptNet(list(pending_counts))
        for word, occurrences in pending_counts.items():
            score = batch_scores.get(word)
            w_score = score if score is not None else 0
            word_scores[word] = w_score
            if w_score > 0:
                sum_pos_score += w_score * occurrences
            else:
                sum_neg_score += w_score * occurrences

    if verbose:
        print("pos:", sum_pos_score)
        print("neg:", sum_neg_score)
    contra = sum_pos_score > 0 and sum_neg_score < 0
    return sum_pos_score, sum_neg_score, contra

# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sia = SentimentIntensityAnalyzer()

# def sentiment(headline):
#     contra=False
#     sum_neg_score=0
#     sum_pos_score=0
#     sentiment_scores = sia.polarity_scores(headline)
#     sum_neg_score=-5*sentiment_scores['neg']        
#     sum_pos_score=5*sentiment_scores['pos']
    
#     if sum_pos_score > 0 and sum_neg_score < 0:
#         contra=True
#     return sum_pos_score,sum_neg_score,contra  

Checking Sentence Coherence

In [29]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy's small English pipeline. This rebinds `nlp`, replacing the
# StanfordCoreNLP client created in the preprocessing cell.
nlp = spacy.load('en_core_web_sm')

def is_pronoun(token):
    """Return True when the token's ``pos_`` tag is "PRON" or "REFLEX"."""
    return token.pos_ in ("PRON", "REFLEX")

def check_coreference(s1, s2, w1, w2):
    """Apply five rule-based checks to decide whether w1 (in s1) and w2 (in s2) corefer.

    Both sentences are parsed with ``nlp``; the first token whose text equals
    w1 (resp. w2) represents that word. The rules are tried in order and the
    first one that fires returns True:

    1. both tokens are pronouns with identical text;
    2. w1 and w2 are equal ignoring case and are not stop words;
    3. the w2 token is "the";
    4. the w2 token is a demonstrative;
    5. both tokens carry the entity type "PERSON".

    Returns:
        True when any rule fires, otherwise False.
    """
    first_hits = [tok for tok in nlp(s1) if tok.text == w1]
    second_hits = [tok for tok in nlp(s2) if tok.text == w2]
    head1 = first_hits[0] if first_hits else None
    head2 = second_hits[0] if second_hits else None
    both_found = head1 is not None and head2 is not None

    # Rule 1: pronoun match
    if both_found and is_pronoun(head1) and is_pronoun(head2) and head1.text == head2.text:
        return True

    # Rule 2: string match (stop words excluded)
    w1_lower = w1.lower()
    if w1_lower == w2.lower() and w1_lower not in STOP_WORDS:
        return True

    if head2 is not None:
        # Rule 3: definite noun phrase
        if head2.text.lower() == "the":
            return True
        # Rule 4: demonstrative noun phrase
        if head2.text.lower() in {"this", "that", "these", "those", "here", "there", "such", "so", "same"}:
            return True

    # Rule 5: both proper names
    if both_found and head1.ent_type_ == "PERSON" and head2.ent_type_ == "PERSON":
        return True

    # No rule applied.
    return False

def check_coherence(headline):
    """Check whether the first and last sentence of a headline refer to the same thing.

    The headline is split into sentences with ``nlp``. From the first sentence
    the first token with dependency "nsubj" or "ROOT" is taken, from the last
    sentence the first token with dependency "dobj" or "pobj"; the pair is then
    passed to check_coreference().

    Returns:
        False when the headline yields no sentences or either token cannot be
        found; otherwise the result of check_coreference().
    """
    sentences = list(nlp(headline).sents)
    # An empty headline has no sentences; indexing would raise IndexError.
    if not sentences:
        return False

    s1 = sentences[0].text
    doc1 = nlp(s1)
    w1 = next((token.text for token in doc1 if token.dep_ == "nsubj" or token.dep_ == "ROOT"), None)
    if w1 is None:
        return False

    s2 = sentences[-1].text
    doc2 = nlp(s2)
    w2 = next((token.text for token in doc2 if token.dep_ == "dobj" or token.dep_ == "pobj"), None)
    if w2 is None:
        return False

    return check_coreference(s1, s2, w1, w2)



N-Gram Classifier (SVM 1)

In [3]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import pickle
from sklearn.model_selection import train_test_split, cross_val_score

def train_svm1():
    """Train the n-gram SVM (SVM 1) on Lemma_Data.json and persist it.

    Reads the headlines and labels, vectorizes the headlines into 1- to 3-gram
    counts, prints the accuracy of each fold of a 10-fold cross-validation of a
    linear SVC, fits the classifier on the whole dataset and pickles the
    vectorizer and classifier to 'vectorizer1.pickle' / 'classifier1.pickle'.

    Returns:
        (vectorizer, svm1): the fitted CountVectorizer and SVC.
    """
    with open("Lemma_Data.json", 'r') as f:
        data = json.load(f)

    # Records are stored under the string keys "0", "1", ...
    record_keys = [str(i) for i in range(len(data))]
    headlines = [data[key]['headline'] for key in record_keys]
    labels = [data[key]['is_sarcastic'] for key in record_keys]

    # n-gram count features (unigrams to trigrams)
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    headline_matrix = vectorizer.fit_transform(headlines)

    svm1 = svm.SVC(kernel='linear')

    # Report per-fold accuracy of a 10-fold cross-validation.
    scores = cross_val_score(svm1, headline_matrix, labels, cv=10)
    for fold, score in enumerate(scores):
        print(f"Fold {fold+1} Accuracy: {score}")

    # The persisted model is fitted on the entire dataset.
    svm1.fit(headline_matrix, labels)

    for filename, artifact in (('vectorizer1.pickle', vectorizer), ('classifier1.pickle', svm1)):
        with open(filename, 'wb') as handle:
            pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return vectorizer, svm1

# Train SVM 1: prints the accuracy of each of the 10 cross-validation folds and
# writes vectorizer1.pickle / classifier1.pickle to the working directory.
vectorizer, svm1 = train_svm1()



Fold 1 Accuracy: 0.8468738300262074
Fold 2 Accuracy: 0.8378884312991389
Fold 3 Accuracy: 0.8509921377761138
Fold 4 Accuracy: 0.8483713964807188
Fold 5 Accuracy: 0.8352676900037439
Fold 6 Accuracy: 0.8517409210033695
Fold 7 Accuracy: 0.847997004867091
Fold 8 Accuracy: 0.8483713964807188
Fold 9 Accuracy: 0.842755522276301
Fold 10 Accuracy: 0.8550561797752809


Feature Classification (SVM 2)

In [11]:
#Features Used
import spacy
from spacy.matcher import Matcher

# spaCy pipeline used by the feature functions below (rebinds the name `nlp`).
nlp = spacy.load("en_core_web_sm")
# Token matcher used by count_idioms_in_headline().
matcher = Matcher(nlp.vocab)

# Only one idiom is registered: the token sequence "in the end of the day",
# matched on lower-cased token text.
idiom_pattern = [{'LOWER': 'in'}, {'LOWER': 'the'}, {'LOWER': 'end'}, {'LOWER': 'of'}, {'LOWER': 'the'}, {'LOWER': 'day'}]
matcher.add("idiom", [idiom_pattern])

def getSentimentFeature(sum_pos_score, sum_neg_score):
    """Bucket the positive and negative sentiment sums into "low"/"medium"/"high".

    Positive sum: <= 1 is "low", >= 2 is "high", anything in between "medium".
    Negative sum: < -2 is "high", from -2 up to (not including) -1 is "medium",
    everything else "low".

    Returns:
        (pos_level, neg_level) as strings.
    """
    pos_level = (
        "low" if sum_pos_score <= 1
        else "high" if sum_pos_score >= 2
        else "medium"
    )
    neg_level = (
        "high" if sum_neg_score < -2
        else "medium" if -2 <= sum_neg_score < -1
        else "low"
    )
    return pos_level, neg_level

def count_repetitive_punctuations(text):
    """Count runs of two or more identical '!', '?', '.' or ',' characters.

    A single period is no longer counted: the previous pattern used ``\\.+``,
    which treated every ordinary full stop as "repetitive" punctuation.

    Returns:
        Number of non-overlapping runs found in ``text``.
    """
    pattern = r'(!{2,}|\?{2,}|\.{2,}|,{2,})'
    return len(re.findall(pattern, text))

def count_repetitive_characters(text):
    """Count runs in which the same letter appears three or more times in a row.

    This is the elongation signal ("sooo", "cooool"). The previous pattern
    ``(\\w{2,})\\1{1,}`` looked for a repeated multi-character group instead,
    so it missed "sooo" and fired on ordinary words such as "banana".

    Returns:
        Number of non-overlapping letter runs of length >= 3 in ``text``.
    """
    # [^\W\d_] is "a word character that is neither a digit nor an underscore",
    # i.e. a letter, so numbers like 1000 are not counted as elongation.
    pattern = r'([^\W\d_])\1{2,}'
    return len(re.findall(pattern, text))

def count_capitalized_words(text):
    """Return how many whitespace-separated words of ``text`` are entirely upper-case."""
    return sum(1 for word in text.split() if word.isupper())


# Booster words and slang terms, hoisted so they are built once, not per call.
_BOOSTER_WORDS = frozenset(['outstanding', 'exceptional', 'remarkable', 'superb', 'excellent', 'terrific', 'fabulous', 'splendid', 'majestic', 'breathtaking', 'wonderful', 'marvelous', 'extraordinary', 'amazing', 'awesome', 'incredible', 'fantastic', 'unbelievable', 'phenomenal', 'mind-blowing', 'epic', 'legendary', 'spectacular', 'jaw-dropping', 'astounding', 'awe-inspiring', 'stunning', 'gorgeous', 'beautiful', 'mesmerizing', 'captivating', 'charming', 'delightful', 'enchanting'])
_SLANG_WORDS = frozenset(term.lower() for term in ['lit', 'squad', 'bae', 'fomo', 'salty', 'cray', 'yolo', 'hundo', 'goat', 'savage', 'dope', 'fire', 'fleek', 'lit af', 'thicc', 'on fleek', 'gucci', 'shook', 'fam', 'hella', 'woke', 'turnt', 'baeless', 'basic', 'high-key', 'low-key', 'extra', 'thirsty', 'slay', 'swag', 'trap', 'clap back', 'thot', 'twerk', 'dank', 'sippin tea', 'chill', 'ghost', 'shade', 'throwing shade', 'baewatch', 'bye Felicia'])
_SLANG_BOOSTER_TERMS = _BOOSTER_WORDS | _SLANG_WORDS
# Longest entry, in tokens, so multi-word entries such as "clap back" can match.
_MAX_TERM_TOKENS = max(len(term.split()) for term in _SLANG_BOOSTER_TERMS)
# Punctuation stripped from token edges; inner hyphens ("low-key") are kept.
_TOKEN_EDGE_PUNCTUATION = "!?.,;:\"'()[]{}"


def count_slang_booster_words(headline):
    """Count booster words and slang terms in a headline.

    Matching is case-insensitive and ignores punctuation attached to a token
    ("AMAZING!" counts). Multi-word entries ("lit af", "throwing shade",
    "bye Felicia") are matched as token sequences; the previous token-by-token
    comparison could never match them. At each position the longest matching
    entry wins and is counted once, so "throwing shade" counts as one term.

    Returns:
        Total number of booster/slang terms found.
    """
    tokens = [tok.lower().strip(_TOKEN_EDGE_PUNCTUATION) for tok in headline.split()]
    tokens = [tok for tok in tokens if tok]

    count = 0
    pos = 0
    while pos < len(tokens):
        for size in range(min(_MAX_TERM_TOKENS, len(tokens) - pos), 0, -1):
            if ' '.join(tokens[pos:pos + size]) in _SLANG_BOOSTER_TERMS:
                count += 1
                pos += size
                break
        else:
            pos += 1
    return count


def count_exclamation_marks(text):
    """Return the number of '!' characters in ``text``."""
    return text.count("!")

def count_idioms_in_headline(headline):
    """Return how many matches the module-level ``matcher`` finds in the parsed headline."""
    return len(matcher(nlp(headline)))

def _count_level_flags(count):
    """Map a raw count to one-hot (low, med, high) flags: 0 -> low, 1-3 -> med, more than 3 -> high."""
    return count == 0, 1 <= count <= 3, count > 3


def make_features(data):
    """Build the SVM 2 feature dictionary for a single headline.

    Fixes over the previous version:
      * counts above 3 now set the ``*_high`` flag (the old "high" branch was a
        copy of the "medium" branch, so ``*_high`` was always False);
      * the idiom flags ``P6_*`` are derived from the idiom count (they used to
        copy the capitalised-word flags ``P3_*``).

    Args:
        data: The headline text.

    Returns:
        dict with the keys
          'headline'                       the input text,
          'contra'                         sentiment() found both polarities,
          'contra+coher'                   contra, and the headline has more than
                                           one sentence, and check_coherence()
                                           returned True,
          'sum_pos_score', 'sum_neg_score' the sentiment sums,
          'P1_*' .. 'P6_*'                 low/med/high flags for repeated
                                           punctuation, repeated characters,
                                           capitalised words, slang/booster words,
                                           exclamation marks and idioms,
          'P7_*', 'P8_*'                   low/med/high flags for the positive and
                                           negative sentiment level.
    """
    headline = data
    features = {'headline': headline}

    sum_pos_score, sum_neg_score, contra = sentiment(headline)

    # Coherence is only meaningful (and only checked) for multi-sentence headlines.
    num_sentences = len(list(nlp(headline).sents))
    coherent = num_sentences > 1 and check_coherence(headline)
    features['contra+coher'] = bool(contra and coherent)
    features['contra'] = bool(contra)

    features['sum_pos_score'] = sum_pos_score
    features['sum_neg_score'] = sum_neg_score

    # Count-based features, each bucketed into one-hot low/med/high flags.
    counters = (
        ('P1', count_repetitive_punctuations),
        ('P2', count_repetitive_characters),
        ('P3', count_capitalized_words),
        ('P4', count_slang_booster_words),
        ('P5', count_exclamation_marks),
        ('P6', count_idioms_in_headline),
    )
    for prefix, counter in counters:
        low, med, high = _count_level_flags(counter(headline))
        features[f'{prefix}_low'] = low
        features[f'{prefix}_med'] = med
        features[f'{prefix}_high'] = high

    # Sentiment-level features (P7: positive, P8: negative).
    pos_level, neg_level = getSentimentFeature(sum_pos_score, sum_neg_score)
    for prefix, level in (('P7', pos_level), ('P8', neg_level)):
        features[f'{prefix}_high'] = level == "high"
        features[f'{prefix}_med'] = level == "medium"
        features[f'{prefix}_low'] = level == "low"

    return features


In [7]:
#Implementation of these features
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
import re
import json
import pickle

def make_features_svm(data):
    """Build the feature dictionary of every headline in the dataset.

    The per-headline logic used to be a line-for-line copy of make_features();
    it now delegates to that function so the two cannot drift apart.

    Args:
        data: dict keyed by the strings "0", "1", ...; each value holds a
            'headline' entry.

    Returns:
        dict mapping each integer index to the make_features() result for the
        corresponding headline.
    """
    features_set = {}
    for i in range(len(data)):
        print(i)  # progress indicator; the per-headline work is slow
        features_set[i] = make_features(data[str(i)]['headline'])

    return features_set

# Function to extract features from a headline
def extract_features(headline_dict, i):
    """Select the model features of record ``i`` from a feature table.

    Args:
        headline_dict: dict keyed by stringified index; each value is a feature
            dictionary containing at least the keys selected here.
        i: Record index; converted to ``str`` before the lookup.

    Returns:
        A new dict with the two sentiment sums, the 'contra' and 'contra+coher'
        flags and the low/med/high flags of P1 to P8. Other entries of the
        record (such as the headline text) are not copied.
    """
    record = headline_dict[str(i)]

    selected = ['sum_neg_score', 'sum_pos_score', 'contra', 'contra+coher']
    selected += [
        f'P{group}_{level}'
        for group in range(1, 9)
        for level in ('low', 'med', 'high')
    ]
    return {name: record[name] for name in selected}




from sklearn.model_selection import KFold
import numpy as np

# Function to train SVM classifier using 10-fold cross-validation
def train_svm2():
    """Train the hand-crafted-feature SVM (SVM 2) and persist it.

    Labels are read from Lemma_Data.json and the precomputed feature
    dictionaries from Features.json. A 10-fold cross-validation (shuffled,
    seed 42) of a linear SVC is run and its mean accuracy printed. The
    classifier that is saved and returned is then fitted on the *entire*
    dataset, as train_svm1() does. Previously the classifier of the last fold,
    which had seen only 90% of the data, was the one pickled.

    Returns:
        (vectorizer, clf): the fitted DictVectorizer and SVC.

    Raises:
        ValueError: If the number of feature records differs from the number
            of labels.
    """
    with open("Lemma_Data.json", 'r') as f:
        data = json.load(f)

    with open("Features.json", "r") as f:
        new_features = json.load(f)

    # Records are stored under the string keys "0", "1", ...
    labels = np.array([data[str(i)]['is_sarcastic'] for i in range(len(data))])

    features = [extract_features(new_features, i) for i in range(len(new_features))]
    if len(features) != len(labels):
        raise ValueError(
            f"Features.json has {len(features)} records but Lemma_Data.json has {len(labels)} labels"
        )

    # Convert the feature dictionaries into a matrix of feature vectors.
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(features)

    # 10-fold cross-validation, used only to estimate accuracy.
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracies = []
    for train_index, test_index in kf.split(feature_matrix):
        fold_clf = svm.SVC(kernel='linear')
        fold_clf.fit(feature_matrix[train_index], labels[train_index])
        accuracies.append(fold_clf.score(feature_matrix[test_index], labels[test_index]))

    avg_accuracy = np.mean(accuracies)
    print("Average Accuracy:", avg_accuracy)

    # Final model: fitted on all data, not on a single fold's training split.
    clf = svm.SVC(kernel='linear')
    clf.fit(feature_matrix, labels)

    with open('vectorizer2.pickle', 'wb') as handle:
        pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open('classifier2.pickle', 'wb') as handle:
        pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return vectorizer, clf

# Train SVM 2: prints the average 10-fold cross-validation accuracy and writes
# vectorizer2.pickle / classifier2.pickle to the working directory.
vectorizer2, svm2 = train_svm2()


Average Accuracy: 0.56535231372615


Merge SVMs

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Load the saved vectorizers and classifiers.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were written by this notebook.
with open('vectorizer1.pickle', 'rb') as handle:
    vectorizer1 = pickle.load(handle)

with open('classifier1.pickle', 'rb') as handle:
    svm1 = pickle.load(handle)

with open('vectorizer2.pickle', 'rb') as handle:
    vectorizer2 = pickle.load(handle)

with open('classifier2.pickle', 'rb') as handle:
    svm2 = pickle.load(handle)

# Load the evaluation data.
# NOTE(review): this is the same Lemma_Data.json that train_svm1()/train_svm2()
# read for training, so the accuracy below is measured on training data and
# will be optimistic. Use a held-out split for a real estimate.
with open("Lemma_Data.json", 'r') as f:
    data = json.load(f)

# Records are stored under the string keys "0", "1", ...
test_headlines = [data[str(i)]['headline'] for i in range(len(data))]
test_labels = [data[str(i)]['is_sarcastic'] for i in range(len(data))]

with open("Features.json", "r") as f:
    new_features = json.load(f)

features = [extract_features(new_features, i) for i in range(len(new_features))]
test_matrix1 = vectorizer1.transform(test_headlines)
test_matrix2 = vectorizer2.transform(features)

# Predict with both classifiers.
predicted_labels1 = svm1.predict(test_matrix1)
predicted_labels2 = svm2.predict(test_matrix2)

# Merge: keep the label when both classifiers agree, otherwise fall back to 0
# (not sarcastic).
merged_predictions = [
    label1 if label1 == label2 else 0
    for label1, label2 in zip(predicted_labels1, predicted_labels2)
]

# Score the merged predictions themselves. The previous code printed the
# cross-validation accuracy of SVM 1 alone under the "merged" label and never
# evaluated merged_predictions.
merged_accuracy = accuracy_score(test_labels, merged_predictions)
print("Merged SVM Accuracy:", merged_accuracy)


Testing

In [32]:
import nltk
from nltk.stem import WordNetLemmatizer

#nltk.download('wordnet')

def lemmatize_text(text):
    """Tokenize ``text`` with NLTK, lemmatize each token with WordNet and re-join with spaces.

    Returns:
        The lemmatized tokens joined by single spaces, as one string.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in nltk.word_tokenize(text))
headline = "Trump is the new jesus!!!!"
headline = lemmatize_text(headline)

# Load both vectorizer/classifier pairs.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were written by this notebook.
with open('vectorizer2.pickle', 'rb') as handle:
    vectorizer2 = pickle.load(handle)

with open('classifier2.pickle', 'rb') as handle:
    svm2 = pickle.load(handle)

with open('vectorizer1.pickle', 'rb') as handle:
    vectorizer1 = pickle.load(handle)

with open('classifier1.pickle', 'rb') as handle:
    svm1 = pickle.load(handle)

# SVM 1 input: n-gram counts of the headline.
new_matrix = vectorizer1.transform([headline])

# SVM 2 input: the same feature subset used for training (extract_features drops
# the raw headline text), passed as a one-element batch of dictionaries.
features = make_features(headline)
svm2_features = extract_features({'0': features}, 0)
new_matrix_svm2 = vectorizer2.transform([svm2_features])

class1 = svm1.predict(new_matrix)[0]
class2 = svm2.predict(new_matrix_svm2)[0]

if class1 == class2:
    # The classifiers agree.
    final_result = class1
else:
    # The classifiers disagree: trust the one that is further from its decision
    # boundary. The distance is the absolute value of the decision function.
    # Comparing the signed values, as before, always selected whichever
    # classifier predicted class 1.
    # NOTE(review): margins of two differently-featured SVMs are not calibrated
    # against each other; this is a heuristic tie-break.
    margin1 = abs(svm1.decision_function(new_matrix)[0])
    margin2 = abs(svm2.decision_function(new_matrix_svm2)[0])
    final_result = class1 if margin1 > margin2 else class2

if final_result == 1:
    print('Headline is sarcastic')
else:
    print('Headline is not sarcastic')

Headline is not sarcastic
