In [1]:
import numpy as np
import pandas as pd

In [5]:
csv_file_path = 'Train_Dataset.csv'

# Load the CSV file into a DataFrame
data = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame
print(data.head())

   Unnamed: 0                                              tweet  sarcastic  \
0           0  The only thing I got from college is a caffein...          1   
1           1  I love it when professors draw a big question ...          1   
2           2  Remember the hundred emails from companies whe...          1   
3           3  Today my pop-pop told me I was not “forced” to...          1   
4           4  @VolphanCarol @littlewhitty @mysticalmanatee I...          1   

                                            rephrase  sarcasm  irony  satire  \
0  College is really difficult, expensive, tiring...      0.0    1.0     0.0   
1  I do not like when professors don’t write out ...      1.0    0.0     0.0   
2  I, at the bare minimum, wish companies actuall...      0.0    1.0     0.0   
3  Today my pop-pop told me I was not "forced" to...      1.0    0.0     0.0   
4  I would say Ted Cruz is an asshole and doesn’t...      1.0    0.0     0.0   

   understatement  overstatement  rhetorical

In [7]:
print("Null Values\n",data.isnull().sum())
print("\nData Shape: " , data.shape)

Null Values
 Unnamed: 0                0
tweet                     1
sarcastic                 0
rephrase               2601
sarcasm                2601
irony                  2601
satire                 2601
understatement         2601
overstatement          2601
rhetorical_question    2601
dtype: int64

Data Shape:  (3468, 10)


In [23]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#\w+', '', text)  # remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)
    
df['tweet'].fillna('', inplace=True)
df['tweet'] = df['tweet'].apply(preprocess_text)

# Extract features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['tweet'])
y = df['sarcastic']  # assume 'label' is the column with the sarcasm labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tweet'].fillna('', inplace=True)


Accuracy: 0.7204610951008645
Classification Report:
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       495
           1       1.00      0.03      0.05       199

    accuracy                           0.72       694
   macro avg       0.86      0.51      0.44       694
weighted avg       0.80      0.72      0.61       694

Confusion Matrix:
[[495   0]
 [194   5]]


In [19]:
print(df.columns)

Index(['Unnamed: 0', 'tweet', 'sarcastic', 'rephrase', 'sarcasm', 'irony',
       'satire', 'understatement', 'overstatement', 'rhetorical_question'],
      dtype='object')


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix
from textblob import TextBlob
import joblib
import re
import numpy as np

# Load the data
df = pd.read_csv('Train_Dataset.csv')

# Handle missing values and data cleaning
df.fillna('', inplace=True)

# Feature Engineering
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Create sentiment features
df['original_sentiment'] = df['tweet'].apply(get_sentiment)
df['rephrased_sentiment'] = df['rephrase'].apply(get_sentiment)  # Assuming 'rephrase' is the correct column name
df['sentiment_difference'] = df['original_sentiment'] - df['rephrased_sentiment']

# Text feature extraction
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(df['tweet']).toarray()

# Convert text features to DataFrame
X_text_df = pd.DataFrame(X_text)

# Ensure additional labels are numeric
df[['irony', 'satire', 'understatement', 'overstatement']] = df[['irony', 'satire', 'understatement', 'overstatement']].apply(pd.to_numeric, errors='coerce').fillna(0)

# Concatenate features
X = pd.concat([X_text_df, df[['irony', 'satire', 'understatement', 'overstatement', 'sentiment_difference']]], axis=1)

# Convert all column names to strings to avoid TypeError
X.columns = [str(col) for col in X.columns]

# Labels for sarcasm detection and type classification
y_sarcasm = df['sarcastic']
y_types = df[['irony', 'satire', 'understatement', 'overstatement']]

# Split the data
X_train, X_test, y_train_sarcasm, y_test_sarcasm = train_test_split(X, y_sarcasm, test_size=0.2, random_state=42)
X_train, X_test, y_train_types, y_test_types = train_test_split(X, y_types, test_size=0.2, random_state=42)

# Model Building
sarcasm_model = LogisticRegression(max_iter=200)
sarcasm_model.fit(X_train, y_train_sarcasm)

# Multi-output classification for types of sarcasm
type_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
type_model.fit(X_train, y_train_types)

# Evaluation for sarcasm detection
y_pred_sarcasm = sarcasm_model.predict(X_test)
print("Sarcasm Detection Report")
print(classification_report(y_test_sarcasm, y_pred_sarcasm))
print(confusion_matrix(y_test_sarcasm, y_pred_sarcasm))

# Evaluation for type classification
y_pred_types = type_model.predict(X_test)
print("Types of Sarcasm Detection Report")
print(classification_report(y_test_types, y_pred_types))

# Save the models
joblib.dump(sarcasm_model, 'sarcasm_detection_model.pkl')
joblib.dump(type_model, 'sarcasm_type_detection_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    return text

# Function to get sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Preprocess the tweet
def preprocess_tweet(tweet, rephrased_tweet=None):
    tweet_cleaned = clean_text(tweet)
    
    if rephrased_tweet:
        rephrased_cleaned = clean_text(rephrased_tweet)
        rephrased_sentiment = get_sentiment(rephrased_cleaned)
    else:
        rephrased_cleaned = ""
        rephrased_sentiment = 0  # Use 0 sentiment for missing rephrased tweets
    
    tweet_sentiment = get_sentiment(tweet_cleaned)
    sentiment_difference = tweet_sentiment - rephrased_sentiment
    
    tweet_vector = tfidf.transform([tweet_cleaned]).toarray()
    
    feature_vector = pd.DataFrame(tweet_vector)
    feature_vector['irony'] = 0  # Initialize with 0; update if available
    feature_vector['satire'] = 0  # Initialize with 0; update if available
    feature_vector['understatement'] = 0  # Initialize with 0; update if available
    feature_vector['overstatement'] = 0  # Initialize with 0; update if available
    feature_vector['sentiment_difference'] = sentiment_difference
    
    # Convert column names to strings
    feature_vector.columns = [str(col) for col in feature_vector.columns]
    
    return feature_vector

# Function to classify the tweet
def classify_tweet(tweet, rephrased_tweet=None):
    features = preprocess_tweet(tweet, rephrased_tweet)
    
    sarcasm_pred = sarcasm_model.predict(features)
    types_pred = type_model.predict(features)
    
    sarcasm_label = "Sarcastic" if sarcasm_pred[0] else "Non-Sarcastic"
    
    types = []
    if types_pred[0][0]:
        types.append("Irony")
    if types_pred[0][1]:
        types.append("Satire")
    if types_pred[0][2]:
        types.append("Understatement")
    if types_pred[0][3]:
        types.append("Overstatement")
    
    return sarcasm_label, types

# Test with a sample tweet
sample_tweet = "Oh great, another rainy day i have such an important event and it would be so nice to wear wet clothes!!."

# During testing, we won't have a rephrased tweet
sarcasm_label, types = classify_tweet(sample_tweet)
print("Sarcasm Label:", sarcasm_label)
print("Types of Sarcasm:", types)


  df.fillna('', inplace=True)


Sarcasm Detection Report
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       497
           1       1.00      0.28      0.44       197

    accuracy                           0.80       694
   macro avg       0.89      0.64      0.66       694
weighted avg       0.84      0.80      0.75       694

[[497   0]
 [141  56]]
Types of Sarcasm Detection Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       1.00      1.00      1.00         2
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00        14

   micro avg       1.00      0.96      0.98        57
   macro avg       0.75      0.75      0.75        57
weighted avg       0.96      0.96      0.96        57
 samples avg       0.08      0.08      0.08        57



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sarcasm Label: Non-Sarcastic
Types of Sarcasm: []


In [3]:
pip install textblob

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------- ---------------- 368.6/626.3 kB 11.2 MB/s eta 0:00:01
   ---------------------------------------- 626.3/626.3 kB 7.8 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [19]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)
print(df.isnull().sum())

X = df['tweet']
y = df['sarcastic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MultinomialNB())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def predict_sarcasm(tweet):
    tweet = clean_text(tweet)
    return model.predict([tweet])[0]

test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


id                        0
tweet                     0
sarcastic                 0
rephrase               2600
sarcasm                2600
irony                  2600
satire                 2600
understatement         2600
overstatement          2600
rhetorical_question    2600
dtype: int64
Accuracy: 0.7161383285302594
Confusion Matrix:
 [[497   0]
 [197   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      1.00      0.83       497
           1       0.00      0.00      0.00       197

    accuracy                           0.72       694
   macro avg       0.36      0.50      0.42       694
weighted avg       0.51      0.72      0.60       694

Tweet: This is just amazing, I love waiting in line for hours.
Predicted Label: 0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob
import torch
from transformers import pipeline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['label'] = df['label'].map(label_mapping)

# Check for missing values
print(df.isnull().sum())

# Define the features and labels
X = df['tweet']
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Common Sense Reasoning using transformers pipeline (ConceptNet, COMET, etc.)
# Initialize the COMET model (or any other common sense model)
common_sense_model = pipeline('text-classification', model='roberta-base-openai-detector')

def get_common_sense_features(tweet):
    # For simplicity, we'll assume the model returns a probability score of how much common sense the tweet makes
    result = common_sense_model(tweet)[0]
    return result['score']

# Sentiment Analysis using TextBlob
def get_sentiment_features(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

# Create new features based on common sense reasoning and sentiment analysis
X_train_common_sense = X_train.apply(get_common_sense_features)
X_test_common_sense = X_test.apply(get_common_sense_features)
X_train_sentiment = X_train.apply(get_sentiment_features)
X_test_sentiment = X_test.apply(get_sentiment_features)

# Combine the new features with the original features
X_train_combined = pd.DataFrame({
    'tweet': X_train,
    'common_sense': X_train_common_sense,
    'sentiment': X_train_sentiment
})
X_test_combined = pd.DataFrame({
    'tweet': X_test,
    'common_sense': X_test_common_sense,
    'sentiment': X_test_sentiment
})

# Create the pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MultinomialNB())
])

# Train the model
model.fit(X_train_combined['tweet'], y_train)

# Make predictions
y_pred = model.predict(X_test_combined['tweet'])

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def predict_sarcasm(tweet):
    tweet_cleaned = clean_text(tweet)
    common_sense_score = get_common_sense_features(tweet_cleaned)
    sentiment_score = get_sentiment_features(tweet_cleaned)
    tweet_combined = pd.DataFrame({
        'tweet': [tweet_cleaned],
        'common_sense': [common_sense_score],
        'sentiment': [sentiment_score]
    })
    return model.predict(tweet_combined['tweet'])[0]

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


ModuleNotFoundError: No module named 'torch'

In [23]:
pip install torch

^C
Note: you may need to restart the kernel to use updated packages.


In [28]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

def get_sentiment_features(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

def split_and_analyze_sentiment(tweet):
    parts = re.split(r'[.,;!?]', tweet)
    sentiment_scores = [get_sentiment_features(part) for part in parts if part.strip()]
    return sentiment_scores

def extract_features(tweet):
    cleaned_tweet = clean_text(tweet)
    sentiment_scores = split_and_analyze_sentiment(cleaned_tweet)
    
    features = {
        'sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else 0,
        'sentiment_stddev': np.std(sentiment_scores) if sentiment_scores else 0
    }
    return features

# Extract features for the dataset
features_df = pd.DataFrame([extract_features(tweet) for tweet in df['tweet']])

X = features_df
y = df['sarcastic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
model = Pipeline([
    ('clf', MultinomialNB())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def predict_sarcasm(tweet):
    features = extract_features(tweet)
    features_df = pd.DataFrame([features])
    return model.predict(features_df)[0]

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


ValueError: Negative values in data passed to MultinomialNB (input X)

In [30]:
pip install requests


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob
import requests
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

def get_sentiment_features(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

def split_and_analyze_sentiment(tweet):
    parts = re.split(r'[.,;!?]', tweet)
    sentiment_scores = [get_sentiment_features(part) for part in parts if part.strip()]
    return sentiment_scores

# def get_common_sense_score(tweet):
#     url = "http://api.conceptnet.io/c/en/"
#     words = tweet.split()
#     scores = []
#     for word in words:
#         response = requests.get(url + word).json()
#         edges = response.get('edges', [])
#         for edge in edges:
#             if edge.get('rel', {}).get('label') == 'Antonym':
#                 scores.append(-1)
#             elif edge.get('rel', {}).get('label') in ['RelatedTo', 'IsA', 'PartOf', 'HasA', 'UsedFor', 'CapableOf']:
#                 scores.append(1)
#             else:
#                 scores.append(0)
#     return sum(scores) / len(scores) if scores else 0

def extract_features(tweet):
    cleaned_tweet = clean_text(tweet)
    sentiment_scores = split_and_analyze_sentiment(cleaned_tweet)
    # common_sense_score = get_common_sense_score(cleaned_tweet)
    
    features = {
        'sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else 0,
        'sentiment_stddev': np.std(sentiment_scores) if sentiment_scores else 0,
        # 'common_sense_score': common_sense_score
    }
    return features

# Extract features for the dataset
features_df = pd.DataFrame([extract_features(tweet) for tweet in df['tweet']])

X = features_df
y = df['sarcastic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
model = Pipeline([
    ('clf', LogisticRegression())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def predict_sarcasm(tweet):
    features = extract_features(tweet)
    features_df = pd.DataFrame([features])
    return model.predict(features_df)[0]

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours"
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


Accuracy: 0.7161383285302594
Confusion Matrix:
 [[497   0]
 [197   0]]
Classification Report:
                precision    recall  f1-score   support

non-sarcastic       0.72      1.00      0.83       497
    sarcastic       0.00      0.00      0.00       197

     accuracy                           0.72       694
    macro avg       0.36      0.50      0.42       694
 weighted avg       0.51      0.72      0.60       694

Tweet: This is just amazing, I love waiting in line for hours.
Predicted Label: non-sarcastic


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
print(df.head())

   id                                              tweet  sarcastic  \
0   0  the only thing i got from college is a caffein...  sarcastic   
1   1  i love it when professors draw a big question ...  sarcastic   
2   2  remember the hundred emails from companies whe...  sarcastic   
3   3  today my poppop told me i was not “forced” to ...  sarcastic   
4   4  volphancarol littlewhitty mysticalmanatee i di...  sarcastic   

                                            rephrase  sarcasm  irony  satire  \
0  College is really difficult, expensive, tiring...      0.0    1.0     0.0   
1  I do not like when professors don’t write out ...      1.0    0.0     0.0   
2  I, at the bare minimum, wish companies actuall...      0.0    1.0     0.0   
3  Today my pop-pop told me I was not "forced" to...      1.0    0.0     0.0   
4  I would say Ted Cruz is an asshole and doesn’t...      1.0    0.0     0.0   

   understatement  overstatement  rhetorical_question  
0             0.0            0.0    

In [15]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

def get_sentiment_features(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

def split_and_analyze_sentiment(tweet):
    parts = re.split(r'[.,;!?]', tweet)
    sentiment_scores = [get_sentiment_features(part) for part in parts if part.strip()]
    return sentiment_scores

def check_contradictions(tweet):
    contradictions = [
        ("love", "hate"),
        ("happy", "sad"),
        ("good", "bad"),
        ("amazing", "terrible"),
        ("excited", "bored")
    ]
    
    for word1, word2 in contradictions:
        if word1 in tweet and word2 in tweet:
            return -1
    return 1

def extract_features(tweet):
    cleaned_tweet = clean_text(tweet)
    sentiment_scores = split_and_analyze_sentiment(cleaned_tweet)
    common_sense_score = check_contradictions(cleaned_tweet)
    
    features = {
        'sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else 0,
        'sentiment_stddev': np.std(sentiment_scores) if sentiment_scores else 0,
        'common_sense_score': common_sense_score
    }
    return features, sentiment_scores

# Extract features for the dataset and store sentiment scores
features_list = []
sentiment_scores_list = []

for tweet in df['tweet']:
    features, sentiment_scores = extract_features(tweet)
    features_list.append(features)
    sentiment_scores_list.append(sentiment_scores)

features_df = pd.DataFrame(features_list)
df['sentiment_scores'] = sentiment_scores_list

X = features_df
y = df['sarcastic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
model = Pipeline([
    ('clf', LogisticRegression(max_iter=200))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print the sentiment scores for each tweet
for index, row in df.iterrows():
    print(f"Tweet: {row['tweet']}\nSentiment Scores: {row['sentiment_scores']}\n")

def predict_sarcasm(tweet):
    features, _ = extract_features(tweet)
    features_df = pd.DataFrame([features])
    return model.predict(features_df)[0]

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


Accuracy: 0.7161383285302594
Confusion Matrix:
 [[497   0]
 [197   0]]
Classification Report:
                precision    recall  f1-score   support

non-sarcastic       0.72      1.00      0.83       497
    sarcastic       0.00      0.00      0.00       197

     accuracy                           0.72       694
    macro avg       0.36      0.50      0.42       694
 weighted avg       0.51      0.72      0.60       694

Tweet: the only thing i got from college is a caffeine addiction
Sentiment Scores: [0.0]

Tweet: i love it when professors draw a big question mark next to my answer on an exam because i’m always like yeah i don’t either ¯ツ¯
Sentiment Scores: [0.16666666666666666]

Tweet: remember the hundred emails from companies when covid started getting real i’ve gotten three in regards to support for protests and only savagexfenty shared helpful links and actually said black lives matter we love capitalism 🥰🙌🏼
Sentiment Scores: [0.10666666666666666]

Tweet: today my poppop told

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Tweet: if you dont like lemonade youre an extremist 🍋
Sentiment Scores: [0.0]

Tweet: agreeing with the right to dunk on the centre is the real dialectical materialism
Sentiment Scores: [0.24285714285714285]

Tweet: my favorite gay drama is the social network
Sentiment Scores: [0.3166666666666667]

Tweet: texmex restaurants that stop serving breakfast tacos after  am are the reason i have trust issues
Sentiment Scores: [0.0]

Tweet: i love giving a lecture to grad students on a topic i dont fully understand at am lt i need a coffee lmao
Sentiment Scores: [0.55]

Tweet: woke up at  am with a mad craving for pickles i guess that means im expecting
Sentiment Scores: [-0.625]

Tweet: babe stop i’m about to gleek
Sentiment Scores: [0.0]

Tweet: izzychari mikeduncan “please get a free shot or take a once weekly nose swab we will literally pay you to take time off to get the shot” czar nicholas king george charles i and so on
Sentiment Scores: [0.4]

Tweet: high stakes legos
Sentiment Scores:

In [1]:
import pandas as pd
import numpy as np
import re
import string
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

def get_sentiment_score_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(text)
    return score['compound']

def split_and_analyze_sentiment(tweet):
    parts = re.split(r'[.,;!?]', tweet)
    sentiment_scores = [get_sentiment_score_vader(part) for part in parts if part.strip()]
    return sentiment_scores

def check_sentiment_contradiction(sentiment_scores):
    if len(sentiment_scores) < 2:
        return 0  # Not enough data to determine contradiction
    
    min_sentiment = min(sentiment_scores)
    max_sentiment = max(sentiment_scores)
    
    # Adjust the threshold based on empirical results
    return 1 if abs(max_sentiment - min_sentiment) > 0.3 else 0

def extract_features(tweet):
    cleaned_tweet = clean_text(tweet)
    sentiment_scores = split_and_analyze_sentiment(cleaned_tweet)
    contradiction_score = check_sentiment_contradiction(sentiment_scores)
    
    features = {
        'sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else 0,
        'sentiment_stddev': np.std(sentiment_scores) if sentiment_scores else 0,
        'contradiction_score': contradiction_score
    }
    return features, sentiment_scores

# Extract features for the dataset and store sentiment scores
features_list = []
sentiment_scores_list = []

for tweet in df['tweet']:
    features, sentiment_scores = extract_features(tweet)
    features_list.append(features)
    sentiment_scores_list.append(sentiment_scores)

features_df = pd.DataFrame(features_list)
df['sentiment_scores'] = sentiment_scores_list

X = features_df
y = df['sarcastic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
model = Pipeline([
    ('clf', LogisticRegression(max_iter=200))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print the sentiment scores for each tweet
for index, row in df.iterrows():
    print(f"Tweet: {row['tweet']}\nSentiment Scores: {row['sentiment_scores']}\n")

def predict_sarcasm(tweet):
    features, _ = extract_features(tweet)
    features_df = pd.DataFrame([features])
    return model.predict(features_df)[0]

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


Accuracy: 0.7161383285302594
Confusion Matrix:
 [[497   0]
 [197   0]]
Classification Report:
                precision    recall  f1-score   support

non-sarcastic       0.72      1.00      0.83       497
    sarcastic       0.00      0.00      0.00       197

     accuracy                           0.72       694
    macro avg       0.36      0.50      0.42       694
 weighted avg       0.51      0.72      0.60       694

Tweet: the only thing i got from college is a caffeine addiction
Sentiment Scores: [0.0]

Tweet: i love it when professors draw a big question mark next to my answer on an exam because i’m always like yeah i don’t either ¯ツ¯
Sentiment Scores: [0.836]

Tweet: remember the hundred emails from companies when covid started getting real i’ve gotten three in regards to support for protests and only savagexfenty shared helpful links and actually said black lives matter we love capitalism 🥰🙌🏼
Sentiment Scores: [0.9559]

Tweet: today my poppop told me i was not “forced” to g

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Tweet: dominiccummings sack him or better yet charge him
Sentiment Scores: [0.4404]

Tweet: plt slamming their brand name on everything winds me up you’ll find something really nice but it’s got ‘pretty little thing’ written in caps all over the front 😩
Sentiment Scores: [-0.154]

Tweet: meh…drakes album don’t feel like there’s a song on there that i wanna listen to twice maybe it will grow on me 🤷🏼‍♀️
Sentiment Scores: [0.3612]

Tweet: here we go again we are expecting our third baby is due feb  absolutely terrified of having  under  and a  year old but we move baby this is the last addition to the family so going to enjoy this for the very last time ❤️👶🏽
Sentiment Scores: [0.4676]

Tweet: asdaserviceteam imagine your delivery being  hours late and imagine calling up your service team only for them to hang up at pm coincidentally the same time the office closes but it’s okay my £ delivery fee is being refunded though 👊🏻
Sentiment Scores: [0.3291]

Tweet: asdaserviceteam imagine your d

In [25]:
pip install vaderSentiment


Defaulting to user installation because normal site-packages is not writeable
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/126.0 kB ? eta -:--:--
   ------ -------------------------------- 20.5/126.0 kB 330.3 kB/s eta 0:00:01
   ------------ -------------------------- 41.0/126.0 kB 393.8 kB/s eta 0:00:01
   -------------------------------------- 126.0/126.0 kB 925.7 kB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install transformers torch


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.43.1-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     --------- ------------------------------ 10.2/43.7 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.7 kB 165.2 kB/s eta 0:00:01
     -------------------------------------- 43.7/43.7 kB 267.3 kB/s eta 0:00:00
Collecting torch
  Using cached torch-2.3.1-cp312-cp312-win_amd64.whl.metadata (26 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Us




   ---------------------- ---------------- 93.8/159.7 MB 514.7 kB/s eta 0:02:09
   ---------------------- ---------------- 93.8/159.7 MB 514.3 kB/s eta 0:02:09
   ---------------------- ---------------- 93.8/159.7 MB 513.5 kB/s eta 0:02:09
   ---------------------- ---------------- 93.8/159.7 MB 513.1 kB/s eta 0:02:09
   ---------------------- ---------------- 93.9/159.7 MB 511.9 kB/s eta 0:02:09
   ---------------------- ---------------- 93.9/159.7 MB 510.7 kB/s eta 0:02:09
   ---------------------- ---------------- 93.9/159.7 MB 511.5 kB/s eta 0:02:09
   ---------------------- ---------------- 93.9/159.7 MB 510.7 kB/s eta 0:02:09
   ---------------------- ---------------- 93.9/159.7 MB 510.7 kB/s eta 0:02:09
   ---------------------- ---------------- 94.0/159.7 MB 509.1 kB/s eta 0:02:10
   ---------------------- ---------------- 94.0/159.7 MB 507.9 kB/s eta 0:02:10
   ---------------------- ---------------- 94.0/159.7 MB 509.9 kB/s eta 0:02:09
   ---------------------- -------------

In [1]:
import pandas as pd
import numpy as np
import re
import string
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Create a pipeline for sentiment analysis
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Tokenize and encode the tweets
def tokenize_tweets(tweets):
    return tokenizer(tweets.tolist(), padding=True, truncation=True, return_tensors='pt')

# Prepare the data for training
X = tokenize_tweets(df['tweet'])
y = df['sarcastic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for BERT model
pipeline_model = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Train the model (BERT-based model is generally not trained from scratch in this manner)
# Predictions should use pre-trained model
def predict_sarcasm(tweet):
    result = pipeline_model(tweet)
    return result[0]['label']

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Found input variables with inconsistent numbers of samples: [3, 3466]

In [5]:
import os
import pandas as pd
import numpy as np
import re
import string
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Disable the symlink warning
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Create a pipeline for sentiment analysis
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Tokenize and encode the tweets
def tokenize_tweets(tweets):
    return tokenizer(tweets, padding=True, truncation=True, return_tensors='pt')

# Prepare the data for training
X = df['tweet'].tolist()
y = df['sarcastic'].apply(lambda x: 1 if x == 'sarcastic' else 0).tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize and encode the tweets
X_train_tokenized = tokenize_tweets(X_train)
X_test_tokenized = tokenize_tweets(X_test)

# Define a function to predict sarcasm
def predict_sarcasm(tweet):
    inputs = tokenizer(tweet, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    return 'sarcastic' if predictions.item() == 1 else 'non-sarcastic'

# Evaluate the model
y_pred = [predict_sarcasm(tweet) for tweet in X_test]
y_test_labels = ['sarcastic' if label == 1 else 'non-sarcastic' for label in y_test]

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test_labels, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_labels, y_pred))
print("Classification Report:\n", classification_report(y_test_labels, y_pred))

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
print(f"Tweet: {test_tweet}\nPredicted Label: {predict_sarcasm(test_tweet)}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.28530259365994237
Confusion Matrix:
 [[  1 496]
 [  0 197]]
Classification Report:
                precision    recall  f1-score   support

non-sarcastic       1.00      0.00      0.00       497
    sarcastic       0.28      1.00      0.44       197

     accuracy                           0.29       694
    macro avg       0.64      0.50      0.22       694
 weighted avg       0.80      0.29      0.13       694

Tweet: This is just amazing, I love waiting in line for hours.
Predicted Label: sarcastic


In [7]:
import os
import pandas as pd
import numpy as np
import re
import string
import warnings
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Suppress specific warnings
warnings.filterwarnings('ignore', message='.*gamma.*')
warnings.filterwarnings('ignore', message='.*beta.*')

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prepare the data for training
X = df['tweet'].tolist()
y = df['sarcastic'].apply(lambda x: 1 if x == 'sarcastic' else 0).tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize and encode the tweets
def tokenize_tweets(tweets):
    return tokenizer(tweets, padding=True, truncation=True, return_tensors='pt')

train_encodings = tokenize_tweets(X_train)
test_encodings = tokenize_tweets(X_test)

class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SarcasmDataset(train_encodings, y_train)
test_dataset = SarcasmDataset(test_encodings, y_test)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
inputs = tokenizer(test_tweet, return_tensors='pt', truncation=True, padding=True)
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
predicted_label = 'sarcastic' if predictions.item() == 1 else 'non-sarcastic'
print(f"Tweet: {test_tweet}\nPredicted Label: {predicted_label}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [11]:
pip install accelerate -U


Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
   ---------------------------------------- 0.0/315.1 kB ? eta -:--:--
   ----- --------------------------------- 41.0/315.1 kB 991.0 kB/s eta 0:00:01
   -------------------------------- ------- 256.0/315.1 kB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 315.1/315.1 kB 3.3 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.33.0
Note: you may need to restart the kernel to use updated packages.




In [13]:
pip install transformers[torch]


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import numpy as np
import re
import string
import warnings
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Suppress specific warnings
warnings.filterwarnings('ignore', message='.*gamma.*')
warnings.filterwarnings('ignore', message='.*beta.*')

# Load the dataset
df = pd.read_csv('Train_Dataset.csv')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

df['tweet'] = df['tweet'].apply(clean_text)

# Map numeric labels to text labels
label_mapping = {0: 'non-sarcastic', 1: 'sarcastic'}
df['sarcastic'] = df['sarcastic'].map(label_mapping)

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prepare the data for training
X = df['tweet'].tolist()
y = df['sarcastic'].apply(lambda x: 1 if x == 'sarcastic' else 0).tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize and encode the tweets
def tokenize_tweets(tweets):
    return tokenizer(tweets, padding=True, truncation=True, return_tensors='pt')

train_encodings = tokenize_tweets(X_train)
test_encodings = tokenize_tweets(X_test)

class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SarcasmDataset(train_encodings, y_train)
test_dataset = SarcasmDataset(test_encodings, y_test)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Test the model with new tweets
test_tweet = "This is just amazing, I love waiting in line for hours."
inputs = tokenizer(test_tweet, return_tensors='pt', truncation=True, padding=True)
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
predicted_label = 'sarcastic' if predictions.item() == 1 else 'non-sarcastic'
print(f"Tweet: {test_tweet}\nPredicted Label: {predicted_label}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5774,0.61658,0.716138,0.0,0.0,0.0
2,0.5077,0.605368,0.724784,0.295203,0.540541,0.203046
3,0.353,0.756306,0.733429,0.385382,0.557692,0.294416


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Tweet: This is just amazing, I love waiting in line for hours.
Predicted Label: sarcastic


In [3]:
pip install git+https://github.com/atcbosselut/comet-commonsense.git

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/atcbosselut/comet-commonsense.git
  Cloning https://github.com/atcbosselut/comet-commonsense.git to c:\users\hp\appdata\local\temp\pip-req-build-bafbtt3t
  Resolved https://github.com/atcbosselut/comet-commonsense.git to commit beb6b55c38f1bf8b240283be83851b972453de4b



  Running command git clone --filter=blob:none --quiet https://github.com/atcbosselut/comet-commonsense.git 'C:\Users\HP\AppData\Local\Temp\pip-req-build-bafbtt3t'
ERROR: git+https://github.com/atcbosselut/comet-commonsense.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.
