In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import json
import re
import numpy as np
import spacy
import tqdm
import xgboost as xgb
import lightgbm as lgb
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from nltk import pos_tag, word_tokenize
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Download NLTK data (tokenizer, POS tagger, stop words, WordNet).
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# for word_tokenize / pos_tag, while older ones use the names without suffix;
# requesting both keeps the notebook working across NLTK versions.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
):
    nltk.download(nltk_resource, quiet=True)

In [None]:
# Function to read JSON lines file
def read_json_lines(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline at end of file) carry no record; skip them.
        return [json.loads(line) for line in file if line.strip()]


In [None]:
# Load the datasets
file1_path = '/content/drive/MyDrive/Sarcasm_Headlines_Dataset.json'
file2_path = '/content/drive/MyDrive/Sarcasm_Headlines_Dataset_v2.json'

df1 = pd.read_json(file1_path, lines=True)
df2 = pd.read_json(file2_path, lines=True)

# Concatenate the datasets.
# Going by the file names, the second file is a revision of the first, so the same
# (headline, label) pair can occur in both. Duplicates are dropped here; otherwise an
# identical headline can end up in both the training and the test split and inflate scores.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates(subset=['headline', 'is_sarcastic'])
    .reset_index(drop=True)
)

In [None]:


# Separate sarcastic and non-sarcastic headlines
sarcastic_headlines = df[df['is_sarcastic'] == 1]['headline']
non_sarcastic_headlines = df[df['is_sarcastic'] == 0]['headline']

# Draw ten example headlines per class (fixed seed, so the same examples appear on every run)
sarcastic_examples = sarcastic_headlines.sample(10, random_state=42).tolist()
non_sarcastic_examples = non_sarcastic_headlines.sample(10, random_state=42).tolist()

# Print both lists, numbered from 1
print("Sarcastic Headlines:")
for i, headline in enumerate(sarcastic_examples, 1):
    print(f"{i}. {headline}")

print("\nNon-Sarcastic Headlines:")
for i, headline in enumerate(non_sarcastic_examples, 1):
    print(f"{i}. {headline}")


In [None]:
import pandas as pd

# Extracting 3 sarcastic sentences (seeded so the table is identical on every run)
sarcastic_samples = df[df['is_sarcastic'] == 1]['headline'].sample(3, random_state=42).tolist()

# Extracting 3 non-sarcastic sentences
non_sarcastic_samples = df[df['is_sarcastic'] == 0]['headline'].sample(3, random_state=42).tolist()

# Creating a DataFrame to display the samples: one row per sentence with its class name
data = {
    'Sentence': sarcastic_samples + non_sarcastic_samples,
    'Expression': ['sarcastic'] * len(sarcastic_samples) + ['non-sarcastic'] * len(non_sarcastic_samples)
}

df_samples = pd.DataFrame(data)

# Displaying the table
print(df_samples)


In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Split the headline column by label
sarcastic_headlines = df.loc[df['is_sarcastic'] == 1, 'headline']
non_sarcastic_headlines = df.loc[df['is_sarcastic'] == 0, 'headline']

# One long string per class is what WordCloud.generate consumes
sarcastic_text = ' '.join(sarcastic_headlines)
non_sarcastic_text = ' '.join(non_sarcastic_headlines)

cloud_options = dict(width=800, height=400, background_color='white')
sarcastic_wordcloud = WordCloud(**cloud_options).generate(sarcastic_text)
non_sarcastic_wordcloud = WordCloud(**cloud_options).generate(non_sarcastic_text)

# Side-by-side panels, sarcastic on the left
plt.figure(figsize=(16, 8))
cloud_panels = (
    (sarcastic_wordcloud, 'Sarcastic Headlines Word Cloud'),
    (non_sarcastic_wordcloud, 'Non-Sarcastic Headlines Word Cloud'),
)
for panel_no, (panel_cloud, panel_title) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.imshow(panel_cloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')

plt.show()


In [None]:
# Preview the first rows of the combined frame
df.head()

In [None]:
# Preprocessing
# errors='ignore' keeps this cell re-runnable: on a second run 'article_link' is already gone
# and a plain drop would raise KeyError.
df.drop(columns=['article_link'], inplace=True, errors='ignore')  # Drop the 'article_link' column
df.dropna(inplace=True)  # Drop any rows with missing values
df['headline'] = df['headline'].str.lower()  # Convert text to lowercase


In [None]:
# Basic text preprocessing
import re
import string

# Initialize stopwords, lemmatizer, and stemmer once so they are not rebuilt per headline
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Pre-compiled patterns used by preprocess_text.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
# Built from string.punctuation with re.escape so that every ASCII punctuation mark
# (including the backslash) is matched, without relying on hand-escaped characters or
# accidental ranges such as ",-." inside the character class.
_PUNCT_PATTERN = re.compile('[' + re.escape(string.punctuation) + ']')


def preprocess_text(text):
    """Normalise one headline for bag-of-words style features.

    Steps, in order: lowercase, strip URLs, strip ASCII punctuation, tokenize with
    NLTK, drop English stop words, lemmatize, then Porter-stem each token.

    Args:
        text: The raw headline string.

    Returns:
        The processed tokens joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _PUNCT_PATTERN.sub('', text)

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)

# Apply preprocessing to the 'headline' column exactly once.
# The pipeline ends with stemming, which is not guaranteed to be idempotent, so running it
# a second time over already-processed text can alter tokens further.
df['headline'] = df['headline'].apply(preprocess_text)

# Check for any missing values (printed explicitly: a bare expression in the middle of a
# cell is not displayed)
print(df.isnull().sum())

# Display the first few rows after preprocessing
print("\nAfter Preprocessing:")
print(df.head())


### ***EDA***





In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
import json
import nltk

# Download NLTK data
nltk.download('punkt')


In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt



# Split the headline column by label
sarcastic_headlines = df.loc[df['is_sarcastic'] == 1, 'headline']
non_sarcastic_headlines = df.loc[df['is_sarcastic'] == 0, 'headline']

# One long string per class is what WordCloud.generate consumes
sarcastic_text = ' '.join(sarcastic_headlines)
non_sarcastic_text = ' '.join(non_sarcastic_headlines)

cloud_options = dict(width=800, height=400, background_color='white')
sarcastic_wordcloud = WordCloud(**cloud_options).generate(sarcastic_text)
non_sarcastic_wordcloud = WordCloud(**cloud_options).generate(non_sarcastic_text)

# Side-by-side panels, labelled a) and b)
plt.figure(figsize=(16, 8))
cloud_panels = (
    (sarcastic_wordcloud, 'a) Sarcastic Headlines Word Cloud'),
    (non_sarcastic_wordcloud, 'b) Non-Sarcastic Headlines Word Cloud'),
)
for panel_no, (panel_cloud, panel_title) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.imshow(panel_cloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Character count of every headline
df['headline_length'] = df['headline'].map(len)

# Length series per class
sarcastic_lengths = df.loc[df['is_sarcastic'] == 1, 'headline_length']
non_sarcastic_lengths = df.loc[df['is_sarcastic'] == 0, 'headline_length']

# Overlaid, semi-transparent histograms (non-sarcastic drawn first)
plt.figure(figsize=(6, 6))
for lengths, series_label, series_color in (
    (non_sarcastic_lengths, 'Non-Sarcastic', 'blue'),
    (sarcastic_lengths, 'Sarcastic', 'red'),
):
    plt.hist(lengths, bins=30, alpha=0.5, label=series_label, color=series_color, edgecolor='black')

plt.xlabel('Headline Length')
plt.ylabel('Number of Headlines')
plt.title('Histogram of Headline Lengths by Sarcasm')
plt.legend(loc='upper right')

plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



# Character count of every headline
df['headline_length'] = df['headline'].map(len)

# Length series per class
sarcastic_lengths = df.loc[df['is_sarcastic'] == 1, 'headline_length']
non_sarcastic_lengths = df.loc[df['is_sarcastic'] == 0, 'headline_length']

# Class means, drawn as dashed vertical lines below
mean_sarcastic_length = sarcastic_lengths.mean()
mean_non_sarcastic_length = non_sarcastic_lengths.mean()

plt.figure(figsize=(6, 6))

# Draw order is kept (non-sarcastic first): each annotation's height is read from the
# y-axis limit at the moment it is placed.
for lengths, mean_length, series_label, series_color in (
    (non_sarcastic_lengths, mean_non_sarcastic_length, 'Non-Sarcastic', 'blue'),
    (sarcastic_lengths, mean_sarcastic_length, 'Sarcastic', 'red'),
):
    plt.hist(lengths, bins=30, alpha=0.5, label=series_label, color=series_color, edgecolor='black')
    plt.axvline(mean_length, color=series_color, linestyle='dashed', linewidth=2)
    plt.text(mean_length + 2, plt.ylim()[1] * 0.9, f'Mean: {mean_length:.2f}', color=series_color)

plt.xlabel('Headline Length')
plt.ylabel('Number of Headlines')
plt.title('Histogram of Headline Lengths by Sarcasm with Mean Lengths')
plt.legend(loc='upper right')

plt.grid(True)
plt.show()


In [None]:
# Counts of sarcastic and non-sarcastic headlines.
# value_counts orders by frequency; sort_index puts label 0 first and label 1 second so the
# fixed ['Non-Sarcastic', 'Sarcastic'] labels used for plotting cannot be swapped if the
# class balance changes.
count_sarcastic = df['is_sarcastic'].value_counts().sort_index()

# Import seaborn
import seaborn as sns


# Bar plot
plt.figure(figsize=(6, 4))
sns.barplot(x=count_sarcastic.index, y=count_sarcastic.values, palette='viridis')
plt.title('Counts of Sarcastic and Non-Sarcastic Headlines')
plt.xlabel('Sarcasm')
plt.ylabel('Count')
plt.xticks([0, 1], ['Non-Sarcastic', 'Sarcastic'])
plt.show()


In [None]:
import pandas as pd

# Assuming df is already loaded with concatenated datasets
# Extracting 3 sarcastic sentences (seeded so the table is identical on every run)
sarcastic_samples = df[df['is_sarcastic'] == 1]['headline'].sample(3, random_state=42).tolist()

# Extracting 3 non-sarcastic sentences
non_sarcastic_samples = df[df['is_sarcastic'] == 0]['headline'].sample(3, random_state=42).tolist()

# Creating a DataFrame to display the samples: one row per sentence with its class name
data = {
    'Sentence': sarcastic_samples + non_sarcastic_samples,
    'Expression': ['sarcastic'] * len(sarcastic_samples) + ['non-sarcastic'] * len(non_sarcastic_samples)
}

df_samples = pd.DataFrame(data)

# Displaying the table
print(df_samples)


In [None]:
# Pie chart for target variable distribution.
# The wedge labels are positional, so order the counts by label value (0, then 1) instead of
# relying on value_counts' frequency ordering.
class_counts = count_sarcastic.sort_index()
plt.figure(figsize=(5, 5))
plt.pie(class_counts, labels=['Non-Sarcastic', 'Sarcastic'], autopct='%1.1f%%', startangle=140, colors=['lightblue', 'lightgreen'])
plt.title('Target Variable Distribution')
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from collections import Counter # Import the Counter class


In [None]:
# Token list per headline
df['tokens'] = df['headline'].apply(nltk.word_tokenize)

# Collect every token into one flat list and tally it
all_words = []
for token_list in df['tokens']:
    all_words.extend(token_list)
word_freq = Counter(all_words)

# The 20 most frequent tokens as (token, count) pairs
most_common_words = word_freq.most_common(20)
top_terms = [term for term, _ in most_common_words]
top_counts = [count for _, count in most_common_words]

# Bar plot
plt.figure(figsize=(8, 4))
sns.barplot(x=top_terms, y=top_counts, palette='viridis')
plt.title('Most Used Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Build one cloud over every token in the corpus
all_words_text = ' '.join(all_words)
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_words_text)

# Render it without axes
plt.figure(figsize=(6, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Headlines')
plt.axis('off')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Train-test split
# Features are the headline strings, the target is the is_sarcastic column.
X = df['headline']
y = df['is_sarcastic']
# 70/30 split with a fixed seed; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


### ***ML+POS***

In [None]:
# Define models
# Display name -> classifier instance; every classifier is constructed with its library defaults.
# The same instances are re-fitted by each cell that iterates over this dict.
models = {
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'LightGBM': lgb.LGBMClassifier()
}

# Function to train and evaluate every model in `models` on TF-IDF features
def train_and_evaluate_model(X_train, X_test, y_train, y_test, sarcasm):
    """Fit each classifier in the global `models` dict and print its test metrics.

    Args:
        X_train: Iterable of training texts.
        X_test: Iterable of test texts.
        y_train: Training labels.
        y_test: Test labels.
        sarcasm: Free-text tag inserted into the printed messages.

    Returns:
        None. Accuracy and a classification report are printed per model.
    """
    print(f"Training models for {sarcasm}...")

    # The TF-IDF features do not depend on the classifier, so vectorise once
    # instead of re-fitting the vectorizer for every model.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    for name, model in models.items():
        print(f"Training {name} for {sarcasm}...")
        model.fit(X_train_vectorized, y_train)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy for {name} for {sarcasm}: {accuracy}")
        print(f"Classification report for {name} for {sarcasm}:")
        print(classification_report(y_test, y_pred))
        print("----------------------------------------------------")




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import nltk
from nltk import pos_tag, word_tokenize
from sklearn.svm import SVC

# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# POS tagging feature extractor
class PosTagTransformer(BaseEstimator, TransformerMixin):
    """Replace every text by the space-separated sequence of its NLTK part-of-speech tags."""

    def fit(self, x, y=None):
        # Nothing to learn; returning self keeps the scikit-learn fit/transform protocol.
        return self

    def transform(self, texts):
        """Return one tag string per input text, in input order."""
        tag_strings = []
        for text in texts:
            tagged_tokens = pos_tag(word_tokenize(text))
            tag_strings.append(' '.join(tag for _, tag in tagged_tokens))
        return tag_strings

# Define the vectorizers
tfidf_vectorizer = TfidfVectorizer()  # word-level TF-IDF weights
ngram_vectorizer = CountVectorizer(ngram_range=(1,2))  # raw unigram + bigram counts
pos_tag_vectorizer = TfidfVectorizer()  # TF-IDF over POS-tag strings

# Combine the features
# FeatureUnion concatenates the three feature matrices column-wise; the POS branch first
# converts each text to its tag sequence and then vectorises that.
combined_features = FeatureUnion([
    ('tfidf', tfidf_vectorizer),
    ('ngram', ngram_vectorizer),
    ('pos_tag', Pipeline([
        ('pos_transform', PosTagTransformer()),
        ('tfidf', pos_tag_vectorizer)
    ]))
])

# Apply combined features
# Fitted on the training texts only; the test texts are transformed with the fitted vocabulary.
X_train_features = combined_features.fit_transform(X_train)
X_test_features = combined_features.transform(X_test)


In [None]:
# Fit and report every classifier in `models` on the raw-text train/test split.
train_and_evaluate_model(X_train, X_test, y_train, y_test, "Sarcasm Detection")

In [None]:
# Train the SVM model
# Linear-kernel SVM on the combined feature matrix built above.
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_features, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test_features)

# Evaluate the model
from sklearn.metrics import confusion_matrix, classification_report

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

In [None]:
# POS tagging feature extractor
class PosTagTransformer(BaseEstimator, TransformerMixin):
    """Replace every text by the space-separated sequence of its NLTK part-of-speech tags."""

    def fit(self, x, y=None):
        # Nothing to learn; returning self keeps the scikit-learn fit/transform protocol.
        return self

    def transform(self, texts):
        """Return one tag string per input text, in input order."""
        tag_strings = []
        for text in texts:
            tagged_tokens = pos_tag(word_tokenize(text))
            tag_strings.append(' '.join(tag for _, tag in tagged_tokens))
        return tag_strings

# Apply POS tagging and TF-IDF
pos_tag_vectorizer = TfidfVectorizer()
pos_transformer = PosTagTransformer()
# The transformer is stateless, so transform() is called directly without fit().
X_train_pos = pos_transformer.transform(X_train)
X_test_pos = pos_transformer.transform(X_test)

# Vocabulary of tags is learned from the training tag strings only.
X_train_pos_tfidf = pos_tag_vectorizer.fit_transform(X_train_pos)
X_test_pos_tfidf = pos_tag_vectorizer.transform(X_test_pos)

# Train SVM with POS tagging features
svm_pos = SVC(kernel='linear', random_state=42)
svm_pos.fit(X_train_pos_tfidf, y_train)
y_pred_pos = svm_pos.predict(X_test_pos_tfidf)

# Evaluate the model
conf_matrix_pos = confusion_matrix(y_test, y_pred_pos)
class_report_pos = classification_report(y_test, y_pred_pos)

print("Confusion Matrix (POS Tagging):\n", conf_matrix_pos)
print("\nClassification Report (POS Tagging):\n", class_report_pos)


### ***N-GRAM***

In [None]:
# N-gram Vectorizer
# Unigram and bigram counts; the vocabulary is learned from the training texts only.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
X_test_ngram = ngram_vectorizer.transform(X_test)

# Train SVM with N-gram features
svm_ngram = SVC(kernel='linear', random_state=42)
svm_ngram.fit(X_train_ngram, y_train)
y_pred_ngram = svm_ngram.predict(X_test_ngram)

# Evaluate the model
conf_matrix_ngram = confusion_matrix(y_test, y_pred_ngram)
class_report_ngram = classification_report(y_test, y_pred_ngram)

print("Confusion Matrix (N-grams):\n", conf_matrix_ngram)
print("\nClassification Report (N-grams):\n", class_report_ngram)


### ***TF-IDF***

In [None]:
# TF-IDF Vectorizer (top 1000 terms, vocabulary learned from the training texts only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit and score every classifier on the pre-computed TF-IDF matrices.
# The evaluation is written inline because no function named `train_evaluate_model`
# exists in this notebook, and `train_and_evaluate_model` expects raw text instead.
for model_name, model in models.items():
    print(f"\nTraining and evaluating {model_name} with TF-IDF features...")
    model.fit(X_train_tfidf, y_train)
    y_pred_tfidf = model.predict(X_test_tfidf)
    print(f"Accuracy for {model_name} (TF-IDF): {accuracy_score(y_test, y_pred_tfidf)}")
    print(classification_report(y_test, y_pred_tfidf))


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Initialize a figure for the combined ROC curve
plt.figure(figsize=(10, 8))

# Common false-positive-rate grid on which each model's TPR is interpolated for averaging
all_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(all_fpr)

# Plot ROC curve for each classifier and accumulate the interpolated true positive rate
for name, model in models.items():
    # Fit the model
    model.fit(X_train_tfidf, y_train)

    # A ROC curve needs a continuous ranking score. Use decision_function when available,
    # otherwise the positive-class probability; hard predict() labels are only a last resort
    # because they collapse the curve to a single operating point.
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test_tfidf)
    elif hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test_tfidf)[:, 1]
    else:
        scores = model.predict(X_test_tfidf)

    # roc_curve only uses the ordering of the scores, so no rescaling is applied
    # (min-max scaling would divide by zero when all scores are equal).
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve for the model
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Accumulate this model's TPR on the common grid
    mean_tpr += np.interp(all_fpr, fpr, tpr)

# Calculate the mean true positive rate across all classifiers
mean_tpr /= len(models)
mean_auc = auc(all_fpr, mean_tpr)

# Plot the combined ROC curve
plt.plot(all_fpr, mean_tpr, color='black', linestyle='--', lw=2, label=f'Combined ROC (AUC = {mean_auc:.2f})')

# Add labels and legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Combined Receiver Operating Characteristic (ROC) Curve for TF-IDF')
plt.legend(loc="lower right")

# Show plot
plt.grid(True)
plt.show()



### ***POS***

In [None]:
class PosTagTransformer(BaseEstimator, TransformerMixin):
    """Replace every text by the space-separated sequence of its NLTK part-of-speech tags."""

    def fit(self, x, y=None):
        # Nothing to learn; returning self keeps the scikit-learn fit/transform protocol.
        return self

    def transform(self, texts):
        """Return one tag string per input text, in input order."""
        tag_strings = []
        for text in texts:
            tagged_tokens = pos_tag(word_tokenize(text))
            tag_strings.append(' '.join(tag for _, tag in tagged_tokens))
        return tag_strings


In [None]:
# Apply POS tagging and TF-IDF
pos_tag_vectorizer = TfidfVectorizer()
pos_transformer = PosTagTransformer()
X_train_pos = pos_transformer.transform(X_train)
X_test_pos = pos_transformer.transform(X_test)

# Tag vocabulary is learned from the training tag strings only
X_train_pos_tfidf = pos_tag_vectorizer.fit_transform(X_train_pos)
X_test_pos_tfidf = pos_tag_vectorizer.transform(X_test_pos)

# Fit and score every classifier on the POS-tag features.
# The evaluation is written inline because no function named `train_evaluate_model`
# exists in this notebook, and `train_and_evaluate_model` expects raw text instead.
for model_name, model in models.items():
    print(f"\nTraining and evaluating {model_name} with POS Tagging features...")
    model.fit(X_train_pos_tfidf, y_train)
    y_pred_pos_model = model.predict(X_test_pos_tfidf)
    print(f"Accuracy for {model_name} (POS Tagging): {accuracy_score(y_test, y_pred_pos_model)}")
    print(classification_report(y_test, y_pred_pos_model))


In [None]:
# POS Tagging Transformer
class PosTagTransformer(BaseEstimator, TransformerMixin):
    """Replace every text by the space-separated sequence of its NLTK part-of-speech tags."""

    def fit(self, x, y=None):
        # Nothing to learn; returning self keeps the scikit-learn fit/transform protocol.
        return self

    def transform(self, texts):
        """Return one tag string per input text, in input order."""
        tag_strings = []
        for text in texts:
            tagged_tokens = pos_tag(word_tokenize(text))
            tag_strings.append(' '.join(tag for _, tag in tagged_tokens))
        return tag_strings

# Apply POS tagging
# NOTE(review): the preceding cell builds X_train_pos / X_test_pos and their TF-IDF matrices
# from the same inputs with the same steps; this repeats the tagging pass — confirm it is needed.
pos_transformer = PosTagTransformer()
X_train_pos = pos_transformer.transform(X_train)
X_test_pos = pos_transformer.transform(X_test)

# TF-IDF Vectorizer for POS tags
pos_tag_vectorizer = TfidfVectorizer()
X_train_pos_tfidf = pos_tag_vectorizer.fit_transform(X_train_pos)
X_test_pos_tfidf = pos_tag_vectorizer.transform(X_test_pos)

# Initialize a figure for the combined ROC curve for POS Tagging features
plt.figure(figsize=(10, 8))

# Common false-positive-rate grid on which each model's TPR is interpolated for averaging
all_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(all_fpr)

# Plot ROC curve for each classifier and accumulate the interpolated true positive rate
for name, model in models.items():
    # Fit the model
    model.fit(X_train_pos_tfidf, y_train)

    # A ROC curve needs a continuous ranking score. Use decision_function when available,
    # otherwise the positive-class probability; hard predict() labels are only a last resort
    # because they collapse the curve to a single operating point.
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test_pos_tfidf)
    elif hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test_pos_tfidf)[:, 1]
    else:
        scores = model.predict(X_test_pos_tfidf)

    # roc_curve only uses the ordering of the scores, so no rescaling is applied
    # (min-max scaling would divide by zero when all scores are equal).
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve for the model
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Accumulate this model's TPR on the common grid
    mean_tpr += np.interp(all_fpr, fpr, tpr)

# Calculate the mean true positive rate across all classifiers
mean_tpr /= len(models)
mean_auc = auc(all_fpr, mean_tpr)

# Plot the combined ROC curve
plt.plot(all_fpr, mean_tpr, color='black', linestyle='--', lw=2, label=f'Combined ROC (AUC = {mean_auc:.2f})')

# Add labels and legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Combined Receiver Operating Characteristic (ROC) Curve for POS Tagging')
plt.legend(loc="lower right")

# Show plot
plt.grid(True)
plt.show()


In [None]:
# N-gram Vectorizer (unigram + bigram counts, vocabulary learned from the training texts only)
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
X_test_ngram = ngram_vectorizer.transform(X_test)

# Fit and score every classifier on the n-gram count features.
# The evaluation is written inline because no function named `train_evaluate_model`
# exists in this notebook, and `train_and_evaluate_model` expects raw text instead.
for model_name, model in models.items():
    print(f"\nTraining and evaluating {model_name} with N-gram features...")
    model.fit(X_train_ngram, y_train)
    y_pred_ngram_model = model.predict(X_test_ngram)
    print(f"Accuracy for {model_name} (N-grams): {accuracy_score(y_test, y_pred_ngram_model)}")
    print(classification_report(y_test, y_pred_ngram_model))


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# Initialize a figure for the combined ROC curve
plt.figure(figsize=(10, 8))

# Common false-positive-rate grid on which each model's TPR is interpolated for averaging
all_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(all_fpr)

# Plot ROC curve for each classifier and accumulate the interpolated true positive rate
for name, model in models.items():
    # Fit the model
    model.fit(X_train_ngram, y_train)

    # A ROC curve needs a continuous ranking score. Use decision_function when available,
    # otherwise the positive-class probability; hard predict() labels are only a last resort
    # because they collapse the curve to a single operating point.
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test_ngram)
    elif hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test_ngram)[:, 1]
    else:
        scores = model.predict(X_test_ngram)

    # roc_curve only uses the ordering of the scores, so no rescaling is applied
    # (min-max scaling would divide by zero when all scores are equal).
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve for the model
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

    # Accumulate this model's TPR on the common grid
    mean_tpr += np.interp(all_fpr, fpr, tpr)

# Calculate the mean true positive rate across all classifiers
mean_tpr /= len(models)
mean_auc = auc(all_fpr, mean_tpr)

# Plot the combined ROC curve
plt.plot(all_fpr, mean_tpr, color='black', linestyle='--', lw=2, label=f'Combined ROC (AUC = {mean_auc:.2f})')

# Add labels and legend
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Combined Receiver Operating Characteristic (ROC) Curve for N-Gram')
plt.legend(loc="lower right")

# Show plot
plt.grid(True)
plt.show()



In [None]:
# Define models
# Rebinds the global `models` name to a single LightGBM classifier; every later cell that
# iterates over `models` therefore sees only this entry.
models = {
    'LightGBM': lgb.LGBMClassifier()
}

# Function to train and evaluate every model in `models` on TF-IDF features
def train_and_evaluate_model(X_train, X_test, y_train, y_test, sarcasm):
    """Fit each classifier in the global `models` dict and print its test metrics.

    Args:
        X_train: Iterable of training texts.
        X_test: Iterable of test texts.
        y_train: Training labels.
        y_test: Test labels.
        sarcasm: Free-text tag inserted into the printed messages.

    Returns:
        None. Accuracy and a classification report are printed per model.
    """
    print(f"Training models for {sarcasm}...")

    # The TF-IDF features do not depend on the classifier, so vectorise once
    # instead of re-fitting the vectorizer for every model.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    for name, model in models.items():
        print(f"Training {name} for {sarcasm}...")
        model.fit(X_train_vectorized, y_train)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy for {name} for {sarcasm}: {accuracy}")
        print(f"Classification report for {name} for {sarcasm}:")
        print(classification_report(y_test, y_pred))
        print("----------------------------------------------------")


# TF-IDF Vectorizer (matrices kept as notebook globals)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# train_and_evaluate_model already iterates over `models` and builds its own TF-IDF features
# from the raw text, so it is called exactly once. The former loop around it only printed a
# banner per model while the call itself sat outside the loop body.
print("\nTraining and evaluating models with TF-IDF features...")
train_and_evaluate_model(X_train, X_test, y_train, y_test, "TF-IDF")


### ***Deep Models***

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, GRU, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.fasttext import FastText
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from torchtext.vocab import GloVe
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer


In [None]:
# Tokenization
# The word index is fitted on the training texts only and then applied to both splits.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
# Every sequence is brought to the same length so the arrays can be batched.
maxlen = 100  # You can adjust this value
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Vocabulary size
# +1 leaves row 0 free; the embedding matrices below are indexed by word_index values.
vocab_size = len(tokenizer.word_index) + 1


In [None]:
# Train Word2Vec model
# Trained from scratch on the NLTK-tokenized training headlines (100-dimensional vectors).
w2v_model = Word2Vec(sentences=[nltk.word_tokenize(text) for text in X_train], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = w2v_model.wv

# Create an embedding matrix
# Row i holds the vector of the word with tokenizer index i; words missing from the
# Word2Vec vocabulary keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]


In [None]:
# Load the GloVe model (100-dimensional "6B" vectors via torchtext)
glove = GloVe(name='6B', dim=100)

# Create an embedding matrix for Glove: row i holds the vector of the word with tokenizer
# index i; words without a GloVe entry keep an all-zero row.
embedding_matrix_glove = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    # Indexing a torchtext Vectors object with an unknown token yields a default vector,
    # never None, so membership is tested against the vocabulary mapping instead.
    if word in glove.stoi:
        embedding_matrix_glove[i] = glove[word].numpy()


In [None]:
!pip install sentence-transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, Embedding, SimpleRNN
from keras.utils import to_categorical

# Prepare data
X = df['headline'].values
labels = df['is_sarcastic'].values

# Encode labels as one-hot rows for the 2-unit softmax output of the deep models
le = LabelEncoder()
y = to_categorical(le.fit_transform(labels))

# Split the dataset into train and test sets.
# Stratify on the raw labels with the same test_size and random_state as the earlier
# classical-ML split. This keeps the class balance equal in both parts; and since the
# labels, size and seed match, the partition is expected to coincide with that earlier
# split, which keeps y_train / y_test row-aligned with X_train_pad / X_test_pad
# (TODO confirm on a fresh run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=labels
)

# Initialize Sentence Transformer model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Generate sentence embeddings (one fixed-size vector per headline)
X_train_embeddings = sbert_model.encode(X_train)
X_test_embeddings = sbert_model.encode(X_test)


In [None]:
# Define model architectures using sentence embeddings
def create_lstm_model(input_shape):
    """Build and compile a stacked two-layer LSTM classifier.

    Args:
        input_shape: Shape of one sample, passed to the first LSTM layer.

    Returns:
        A compiled Sequential model ending in a 2-unit softmax layer.
    """
    layer_stack = [
        LSTM(128, input_shape=input_shape, return_sequences=True),
        LSTM(128),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_bilstm_model(input_shape):
    """Build and compile a stacked two-layer bidirectional LSTM classifier.

    Args:
        input_shape: Shape of one sample, passed to the first bidirectional layer.

    Returns:
        A compiled Sequential model ending in a 2-unit softmax layer.
    """
    layer_stack = [
        Bidirectional(LSTM(128, return_sequences=True), input_shape=input_shape),
        Bidirectional(LSTM(128)),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_cnn_model(input_shape):
    """Build and compile a 1-D convolutional classifier with global max pooling.

    Args:
        input_shape: Shape of one sample, passed to the Conv1D layer.

    Returns:
        A compiled Sequential model ending in a 2-unit softmax layer.
    """
    layer_stack = [
        Conv1D(128, 5, activation='relu', input_shape=input_shape),
        GlobalMaxPooling1D(),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_rnn_model(input_shape):
    """Build and compile a stacked two-layer SimpleRNN classifier.

    Args:
        input_shape: Shape of one sample, passed to the first SimpleRNN layer.

    Returns:
        A compiled Sequential model ending in a 2-unit softmax layer.
    """
    layer_stack = [
        SimpleRNN(128, return_sequences=True, input_shape=input_shape),
        SimpleRNN(128),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Train and evaluate models
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test,
                             epochs=100, batch_size=32, validation_split=0.2):
    """Fit a compiled Keras model and print its test-set metrics.

    Args:
        model: Compiled Keras model with a multi-unit (softmax) output.
        X_train: Training inputs.
        y_train: One-hot encoded training labels.
        X_test: Test inputs.
        y_test: One-hot encoded test labels.
        epochs: Number of training epochs (default keeps the previous hard-coded 100).
        batch_size: Mini-batch size (default 32).
        validation_split: Fraction of the training data held out for validation (default 0.2).

    Returns:
        The fitted model.
    """
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
    y_pred = model.predict(X_test)
    # Collapse probability rows / one-hot rows to class indices before scoring.
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    print(classification_report(y_test_classes, y_pred_classes))
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    print("Confusion Matrix:\n", cm)
    return model

In [None]:
# Create a dictionary of models
# Rebinds the global `models` name. Each network is built for samples of shape
# (embedding_dim, 1), i.e. the embedding vector presented as a length-embedding_dim
# sequence with one feature per step.
models = {
    "LSTM": create_lstm_model((X_train_embeddings.shape[1], 1)),
    "Bi-LSTM": create_bilstm_model((X_train_embeddings.shape[1], 1)),
    "CNN": create_cnn_model((X_train_embeddings.shape[1], 1)),
    "RNN": create_rnn_model((X_train_embeddings.shape[1], 1))

}


In [None]:

# Train and evaluate each model
results = {}

# The networks were built with input_shape (embedding_dim, 1), so the 2-D embedding arrays
# get an explicit trailing feature axis here rather than relying on the framework to add it.
X_train_emb_3d = np.expand_dims(X_train_embeddings, axis=2)
X_test_emb_3d = np.expand_dims(X_test_embeddings, axis=2)

for name, model in models.items():
    print(f"Training {name}...")
    trained_model = train_and_evaluate_model(model, X_train_emb_3d, y_train, X_test_emb_3d, y_test)
    results[name] = trained_model

# Print results
for name, result in results.items():
    print(f"{name} model trained and evaluated.")


In [None]:
from sentence_transformers import SentenceTransformer

# Load Sentence Transformer model
# NOTE(review): an earlier cell already binds sbert_model to this same checkpoint name.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Convert headlines to sentence embeddings
# NOTE(review): X_embeddings is not referenced again in the visible notebook — confirm it is needed.
X_embeddings = sbert_model.encode(df['headline'].tolist())

# Continue with the rest of your preprocessing and model training steps


In [None]:
# Define input shape
# One sample = the embedding vector viewed as embedding_dim steps with a single feature each.
input_shape = (X_train_embeddings.shape[1], 1)

# Reshape data for RNN/CNN models
# Adds a trailing axis of size 1 so the arrays match input_shape.
X_train_embeddings_reshaped = np.expand_dims(X_train_embeddings, axis=2)
X_test_embeddings_reshaped = np.expand_dims(X_test_embeddings, axis=2)

In [None]:
def _to_class_ids(values, threshold=None):
    """Reduce scores or labels to a flat vector of class ids.

    Arrays with more than one column (softmax scores, one-hot labels) are
    reduced with ``argmax`` over the last axis. Anything else is flattened;
    when ``threshold`` is given the flattened values are binarised with
    ``> threshold``, otherwise they are returned as they are.
    """
    arr = np.asarray(values)
    if arr.ndim > 1 and arr.shape[-1] > 1:
        return np.argmax(arr, axis=-1)
    flat = arr.reshape(-1)
    if threshold is None:
        return flat
    return (flat > threshold).astype("int32")


def train_and_evaluate_model(model, X_train_pad, y_train, X_test_pad, y_test,
                             epochs=100, batch_size=32, validation_split=0.2):
    """Fit a compiled classifier and print its test-set metrics.

    Works for both output conventions: a single sigmoid unit (scores are
    thresholded at 0.5, as before) and a multi-unit softmax (argmax is taken).
    Thresholding a multi-column output would yield a multilabel indicator
    matrix, which ``confusion_matrix`` does not accept.

    NOTE(review): this definition shadows the earlier function of the same
    name in this notebook; cells executed after it use this version.

    Parameters
    ----------
    model : object exposing Keras-style ``fit`` and ``predict``.
    X_train_pad, y_train : training inputs and labels, forwarded to ``fit``.
    X_test_pad, y_test : held-out inputs and labels (class ids or one-hot rows).
    epochs, batch_size, validation_split : forwarded unchanged to ``model.fit``.
        The defaults reproduce the values that used to be hard-coded here.

    Returns
    -------
    The same ``model`` instance, after fitting.
    """
    model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=validation_split)
    y_pred = _to_class_ids(model.predict(X_test_pad), threshold=0.5)
    y_true_ids = _to_class_ids(y_test)
    print(classification_report(y_true_ids, y_pred))
    cm = confusion_matrix(y_true_ids, y_pred)
    print("Confusion Matrix:\n", cm)
    return model


In [None]:
# Word2Vec + LSTM: build, fit and report.
# NOTE(review): another cell calls create_lstm_model with a single shape tuple;
# confirm the definition live at this point accepts (vocab_size, embedding_matrix).
print("Word2Vec + LSTM")
w2v_lstm_model = create_lstm_model(vocab_size, embedding_matrix)
train_and_evaluate_model(
    w2v_lstm_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# GloVe + LSTM: build, fit and report
print("GloVe + LSTM")
glove_lstm_model = create_lstm_model(vocab_size, embedding_matrix_glove)
train_and_evaluate_model(
    glove_lstm_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# FastText + LSTM: build, fit and report
print("FastText + LSTM")
fasttext_lstm_model = create_lstm_model(vocab_size, embedding_matrix_fasttext)
train_and_evaluate_model(
    fasttext_lstm_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# Word2Vec + Bi-LSTM
print("Word2Vec + Bi-LSTM")
# Use the bidirectional builder: this cell previously called create_lstm_model,
# so the "Bi-LSTM" result was a second plain-LSTM run.
w2v_bilstm_model = create_bilstm_model(vocab_size, embedding_matrix)
train_and_evaluate_model(w2v_bilstm_model, X_train_pad, y_train, X_test_pad, y_test)

In [None]:
# GloVe + Bi-LSTM: build, fit and report
print("GloVe + Bi-LSTM")
glove_bilstm_model = create_bilstm_model(vocab_size, embedding_matrix_glove)
train_and_evaluate_model(
    glove_bilstm_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# FastText + Bi-LSTM: build, fit and report
print("FastText + Bi-LSTM")
fasttext_bilstm_model = create_bilstm_model(vocab_size, embedding_matrix_fasttext)
train_and_evaluate_model(
    fasttext_bilstm_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# Word2Vec + CNN: build, fit and report
print("Word2Vec + CNN")
w2v_cnn_model = create_cnn_model(vocab_size, embedding_matrix)
train_and_evaluate_model(
    w2v_cnn_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# GloVe + CNN: build, fit and report
print("GloVe + CNN")
glove_cnn_model = create_cnn_model(vocab_size, embedding_matrix_glove)
train_and_evaluate_model(
    glove_cnn_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# FastText + CNN: build, fit and report
print("FastText + CNN")
fasttext_cnn_model = create_cnn_model(vocab_size, embedding_matrix_fasttext)
train_and_evaluate_model(
    fasttext_cnn_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# Word2Vec + RNN: build, fit and report
print("Word2Vec + RNN")
w2v_rnn_model = create_rnn_model(vocab_size, embedding_matrix)
train_and_evaluate_model(
    w2v_rnn_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# GloVe + RNN: build, fit and report
print("GloVe + RNN")
glove_rnn_model = create_rnn_model(vocab_size, embedding_matrix_glove)
train_and_evaluate_model(
    glove_rnn_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# FastText + RNN: build, fit and report
print("FastText + RNN")
fasttext_rnn_model = create_rnn_model(vocab_size, embedding_matrix_fasttext)
train_and_evaluate_model(
    fasttext_rnn_model,
    X_train_pad, y_train,
    X_test_pad, y_test,
)

In [None]:
# LSTM Model with Sentence-BERT
def create_lstm_model_sbert(input_shape, num_classes=2):
    """Build and compile an LSTM classifier over Sentence-BERT embeddings.

    Parameters
    ----------
    input_shape : tuple
        Per-sample shape ``(time_steps, features)``; attached to the first layer.
    num_classes : int, default 2
        Width of the softmax output. The head was previously fixed at 3 units,
        which does not match the two-class ``is_sarcastic`` target.
        NOTE(review): assumes the labels are one-hot with two columns — confirm.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy, Adam, accuracy).
    """
    model = Sequential()
    # The shape belongs on the first layer. It used to be wrapped in an extra
    # tuple and passed to the LSTM, which is not the first layer of the stack.
    model.add(SpatialDropout1D(0.2, input_shape=input_shape))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Bi-LSTM Model with Sentence-BERT
def create_bilstm_model_sbert(input_shape, num_classes=2):
    """Build and compile a bidirectional LSTM classifier over Sentence-BERT embeddings.

    Parameters
    ----------
    input_shape : tuple
        Per-sample shape ``(time_steps, features)``; attached to the first layer.
    num_classes : int, default 2
        Width of the softmax output. The head was previously fixed at 3 units,
        which does not match the two-class ``is_sarcastic`` target.
        NOTE(review): assumes the labels are one-hot with two columns — confirm.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy, Adam, accuracy).
    """
    model = Sequential()
    # The shape belongs on the first layer. It used to be wrapped in an extra
    # tuple and passed to the inner LSTM of the Bidirectional wrapper.
    model.add(SpatialDropout1D(0.2, input_shape=input_shape))
    model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# CNN Model with Sentence-BERT
def create_CNN_model_sbert(input_shape, num_classes=2):
    """Build and compile a 1-D convolutional classifier over Sentence-BERT embeddings.

    The previous body instantiated ``CNN(100, dropout=..., recurrent_dropout=...)``;
    Keras has no ``CNN`` layer and convolutions take no recurrent-dropout
    arguments, so the function could not run. It now uses a ``Conv1D`` feature
    extractor followed by global max pooling.

    Parameters
    ----------
    input_shape : tuple
        Per-sample shape ``(time_steps, features)``; attached to the first layer.
    num_classes : int, default 2
        Width of the softmax output. The head was previously fixed at 3 units,
        which does not match the two-class ``is_sarcastic`` target.
        NOTE(review): assumes the labels are one-hot with two columns — confirm.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy, Adam, accuracy).
    """
    # Local import keeps this cell self-contained.
    # NOTE(review): assumes the notebook's other Keras layers come from
    # tensorflow.keras; switch to `keras.layers` if the notebook imports from there.
    from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

    model = Sequential()
    model.add(SpatialDropout1D(0.2, input_shape=input_shape))
    # padding='same' keeps the convolution valid even when time_steps == 1.
    model.add(Conv1D(100, kernel_size=3, padding='same', activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# RNN Model with Sentence-BERT
def create_RNN_model_sbert(input_shape, num_classes=2):
    """Build and compile a simple-RNN classifier over Sentence-BERT embeddings.

    The previous body called ``RNN(100, dropout=..., recurrent_dropout=...)``.
    Keras' generic ``RNN`` wrapper expects a cell object rather than a unit
    count, so this now uses ``SimpleRNN``, the layer the notebook's other RNN
    builder already uses.

    Parameters
    ----------
    input_shape : tuple
        Per-sample shape ``(time_steps, features)``; attached to the first layer.
    num_classes : int, default 2
        Width of the softmax output. The head was previously fixed at 3 units,
        which does not match the two-class ``is_sarcastic`` target.
        NOTE(review): assumes the labels are one-hot with two columns — confirm.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy, Adam, accuracy).
    """
    model = Sequential()
    # The shape belongs on the first layer; it used to be wrapped in an extra
    # tuple and passed to the recurrent layer.
    model.add(SpatialDropout1D(0.2, input_shape=input_shape))
    model.add(SimpleRNN(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Reshape the Sentence-BERT embeddings to add the time_steps dimension.
# Guarded on ndim so re-running this cell does not stack a second extra axis.
if np.ndim(X_train_embeddings) == 2:
    X_train_embeddings = np.expand_dims(X_train_embeddings, axis=1)
if np.ndim(X_test_embeddings) == 2:
    X_test_embeddings = np.expand_dims(X_test_embeddings, axis=1)

# Now input_shape should include the time_steps dimension
input_shape = (X_train_embeddings.shape[1], X_train_embeddings.shape[2])

# Integer class ids of the test labels, used by the classification-report cells.
# One-hot rows are collapsed with argmax; a 1-D label vector is used as it is.
y_true = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)


def _fit_and_predict_sbert(model):
    """Fit `model` on the SBERT embeddings and return its predicted test-set class ids.

    NOTE(review): the test split doubles as validation data here, exactly as in
    the original cell; it only drives the per-epoch log, but confirm that is
    intended.
    """
    model.fit(X_train_embeddings, y_train, epochs=35, batch_size=64,
              validation_data=(X_test_embeddings, y_test), verbose=2)
    return np.argmax(model.predict(X_test_embeddings), axis=1)


# Train and evaluate LSTM with Sentence-BERT
lstm_model_sbert = create_lstm_model_sbert(input_shape)
y_pred_lstm_sbert = _fit_and_predict_sbert(lstm_model_sbert)

# Train and evaluate Bi-LSTM with Sentence-BERT
bilstm_model_sbert = create_bilstm_model_sbert(input_shape)
y_pred_bilstm_sbert = _fit_and_predict_sbert(bilstm_model_sbert)

# Train and evaluate CNN with Sentence-BERT
CNN_model_sbert = create_CNN_model_sbert(input_shape)
y_pred_CNN_sbert = _fit_and_predict_sbert(CNN_model_sbert)

# Train and evaluate RNN with Sentence-BERT
RNN_model_sbert = create_RNN_model_sbert(input_shape)
y_pred_RNN_sbert = _fit_and_predict_sbert(RNN_model_sbert)


In [None]:
# Per-class metrics for the SBERT LSTM; needs y_true and y_pred_lstm_sbert
# to have been defined by an earlier cell.
print(
    "LSTM with Sentence-BERT Classification Report:\n",
    classification_report(y_true, y_pred_lstm_sbert),
)


In [None]:
# Per-class metrics for the SBERT Bi-LSTM; needs y_true and y_pred_bilstm_sbert
# to have been defined by an earlier cell.
print(
    "Bi-LSTM with Sentence-BERT Classification Report:\n",
    classification_report(y_true, y_pred_bilstm_sbert),
)


In [None]:
# Per-class metrics for the SBERT CNN; needs y_true and y_pred_CNN_sbert
# to have been defined by an earlier cell.
print(
    "CNN with Sentence-BERT Classification Report:\n",
    classification_report(y_true, y_pred_CNN_sbert),
)


In [None]:
# Per-class metrics for the SBERT RNN; needs y_true and y_pred_RNN_sbert
# to have been defined by an earlier cell.
print(
    "RNN with Sentence-BERT Classification Report:\n",
    classification_report(y_true, y_pred_RNN_sbert),
)
